<!DOCTYPE html>
<html lang="zh">
<head>
  <meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=2">
<meta name="theme-color" content="#222">
<meta name="generator" content="Hexo 5.3.0">


  <link rel="apple-touch-icon" sizes="180x180" href="/yuwanzi.io/images/apple-touch-icon-next.png">
  <link rel="icon" type="image/png" sizes="32x32" href="/yuwanzi.io/images/favicon-32x32-next.png">
  <link rel="icon" type="image/png" sizes="16x16" href="/yuwanzi.io/images/favicon-16x16-next.png">
  <link rel="mask-icon" href="/yuwanzi.io/images/logo.svg" color="#222">

<link rel="stylesheet" href="/yuwanzi.io/css/main.css">



<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/@fortawesome/fontawesome-free@5.15.1/css/all.min.css">
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/animate.css@3.1.1/animate.min.css">

<script class="hexo-configurations">
    // Theme namespace: keep an already-defined window.NexT, otherwise start from an empty object.
    var NexT = window.NexT || {};

    // Site-level settings exposed as a global.
    // NOTE(review): presumably read by the theme's scripts loaded later in the page — confirm.
    var CONFIG = {
      hostname: "suyuhuan.gitee.io",
      root: "/yuwanzi.io/",
      images: "/yuwanzi.io/images",
      scheme: "Muse",
      version: "8.2.0",
      exturl: false,
      sidebar: {
        position: "left",
        display: "post",
        padding: 18,
        offset: 12
      },
      copycode: false,
      bookmark: {
        enable: false,
        color: "#222",
        save: "auto"
      },
      fancybox: false,
      mediumzoom: false,
      lazyload: false,
      pangu: false,
      comments: {
        style: "tabs",
        active: null,
        storage: true,
        lazyload: false,
        nav: null
      },
      motion: {
        enable: true,
        async: false,
        transition: {
          post_block: "fadeIn",
          post_header: "fadeInDown",
          post_body: "fadeInDown",
          coll_header: "fadeInLeft",
          sidebar: "fadeInUp"
        }
      },
      prism: false,
      i18n: {
        // The ${...} markers are plain text: these are ordinary string literals, not template literals.
        placeholder: "Suche...",
        empty: "We didn't find any results for the search: ${query}",
        hits_time: "${hits} results found in ${time} ms",
        hits: "${hits} results found"
      }
    };
  </script>
<meta name="description" content="这里是玉丸子的个人博客,与你一起发现更大的世界。">
<meta property="og:type" content="website">
<meta property="og:title" content="玉丸子 | Blog">
<meta property="og:url" content="https://suyuhuan.gitee.io/yuwanzi.io/page/2/index.html">
<meta property="og:site_name" content="玉丸子 | Blog">
<meta property="og:description" content="这里是玉丸子的个人博客,与你一起发现更大的世界。">
<meta property="og:locale" content="zh_CN">
<meta property="article:author" content="玉丸子">
<meta name="twitter:card" content="summary">


<link rel="canonical" href="https://suyuhuan.gitee.io/yuwanzi.io/page/2/">


<script class="page-configurations">
  // Page-scoped flags attached to the global CONFIG object.
  // Field reference: https://hexo.io/docs/variables.html
  CONFIG.page = { sidebar: "", isHome: true, isPost: false, lang: 'zh' };
</script>
<title>玉丸子 | Blog</title>
  




  <noscript>
  <style>
  /* No-JavaScript fallback: this stylesheet sits inside <noscript>, so it only
     takes effect when scripting is unavailable. */
  body { margin-top: 2rem; }

  /* Force these blocks to be visible under the .use-motion body class.
     NOTE(review): presumably the theme stylesheet hides them until a script
     animates them in — confirm against main.css. */
  .use-motion .menu-item,
  .use-motion .sidebar,
  .use-motion .post-block,
  .use-motion .pagination,
  .use-motion .comments,
  .use-motion .post-header,
  .use-motion .post-body,
  .use-motion .collection-header {
    visibility: visible;
  }

  /* Reset opacity to the property's initial value for the header, the brand toggle and the footer. */
  .use-motion .header,
  .use-motion .site-brand-container .toggle,
  .use-motion .footer { opacity: initial; }

  /* Reset both opacity and vertical offset for the title, subtitle and custom logo. */
  .use-motion .site-title,
  .use-motion .site-subtitle,
  .use-motion .custom-logo-image {
    opacity: initial;
    top: initial;
  }

  /* Render the logo lines with no horizontal scaling (full width). */
  .use-motion .logo-line {
    transform: scaleX(1);
  }

  /* Hide the elements matched by .search-pop-overlay and .sidebar-nav, and show every .sidebar-panel.
     NOTE(review): presumably these controls need script to operate — confirm. */
  .search-pop-overlay, .sidebar-nav { display: none; }
  .sidebar-panel { display: block; }
  </style>
</noscript>

<link rel="alternate" href="/yuwanzi.io/atom.xml" title="玉丸子 | Blog" type="application/atom+xml">
</head>

<body itemscope itemtype="http://schema.org/WebPage" class="use-motion">
  <div class="headband"></div>

  <main class="main">
    <header class="header" itemscope itemtype="http://schema.org/WPHeader">
      <div class="header-inner"><div class="site-brand-container">
  <div class="site-nav-toggle">
    <div class="toggle" aria-label="Navigationsleiste an/ausschalten" role="button">
    </div>
  </div>

  <div class="site-meta">

    <a href="/yuwanzi.io/" class="brand" rel="start">
      <i class="logo-line"></i>
      <h1 class="site-title">玉丸子 | Blog</h1>
      <i class="logo-line"></i>
    </a>
  </div>

  <div class="site-nav-right">
    <div class="toggle popup-trigger">
    </div>
  </div>
</div>







</div>
        
  
  <div class="toggle sidebar-toggle" role="button">
    <span class="toggle-line"></span>
    <span class="toggle-line"></span>
    <span class="toggle-line"></span>
  </div>

  <aside class="sidebar">

    <div class="sidebar-inner sidebar-overview-active">
      <ul class="sidebar-nav">
        <li class="sidebar-nav-toc">
          Inhaltsverzeichnis
        </li>
        <li class="sidebar-nav-overview">
          Übersicht
        </li>
      </ul>

      <div class="sidebar-panel-container">
        <!--noindex-->
        <div class="post-toc-wrap sidebar-panel">
        </div>
        <!--/noindex-->

        <div class="site-overview-wrap sidebar-panel">
          <div class="site-author site-overview-item animated" itemprop="author" itemscope itemtype="http://schema.org/Person">
  <p class="site-author-name" itemprop="name">玉丸子</p>
  <div class="site-description" itemprop="description">这里是玉丸子的个人博客,与你一起发现更大的世界。</div>
</div>
<div class="site-state-wrap site-overview-item animated">
  <nav class="site-state">
      <div class="site-state-item site-state-posts">
          <a href="/yuwanzi.io/archives">
          <span class="site-state-item-count">68</span>
          <span class="site-state-item-name">Artikel</span>
        </a>
      </div>
      <div class="site-state-item site-state-categories">
            <a href="/yuwanzi.io/categories/">
        <span class="site-state-item-count">39</span>
        <span class="site-state-item-name">Kategorien</span></a>
      </div>
      <div class="site-state-item site-state-tags">
            <a href="/yuwanzi.io/tags/">
        <span class="site-state-item-count">46</span>
        <span class="site-state-item-name">Schlagwörter</span></a>
      </div>
  </nav>
</div>



        </div>
      </div>
    </div>
  </aside>
  <div class="sidebar-dimmer"></div>


    </header>

    
  <div class="back-to-top" role="button">
    <i class="fa fa-arrow-up"></i>
    <span>0%</span>
  </div>

<noscript>
  <div class="noscript-warning">Theme NexT works best with JavaScript enabled</div>
</noscript>


    <div class="main-inner index posts-expand">

    


<div class="post-block">
  
  

  <article itemscope itemtype="http://schema.org/Article" class="post-content" lang="zh">
    <link itemprop="mainEntityOfPage" href="https://suyuhuan.gitee.io/yuwanzi.io/2017/11/06/2017-11-06-spring_and_thread-safe/">

    <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
      <meta itemprop="image" content="/yuwanzi.io/images/avatar.gif">
      <meta itemprop="name" content="玉丸子">
      <meta itemprop="description" content="这里是玉丸子的个人博客,与你一起发现更大的世界。">
    </span>

    <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
      <meta itemprop="name" content="玉丸子 | Blog">
    </span>
      <header class="post-header">
        <h2 class="post-title" itemprop="name headline">
          <a href="/yuwanzi.io/2017/11/06/2017-11-06-spring_and_thread-safe/" class="post-title-link" itemprop="url">聊一聊Spring中的线程安全性</a>
        </h2>

        <div class="post-meta-container">
          <div class="post-meta">
    <span class="post-meta-item">
      <span class="post-meta-item-icon">
        <i class="far fa-calendar"></i>
      </span>
      <span class="post-meta-item-text">Veröffentlicht am</span>

      <time title="Erstellt: 2017-11-06 12:00:00" itemprop="dateCreated datePublished" datetime="2017-11-06T12:00:00+08:00">2017-11-06</time>
    </span>
      <span class="post-meta-item">
        <span class="post-meta-item-icon">
          <i class="far fa-calendar-check"></i>
        </span>
        <span class="post-meta-item-text">Bearbeitet am</span>
        <time title="Geändert am: 2020-11-07 08:58:17" itemprop="dateModified" datetime="2020-11-07T08:58:17+08:00">2020-11-07</time>
      </span>
    <span class="post-meta-item">
      <span class="post-meta-item-icon">
        <i class="far fa-folder"></i>
      </span>
      <span class="post-meta-item-text">in</span>
        <span itemprop="about" itemscope itemtype="http://schema.org/Thing">
          <a href="/yuwanzi.io/categories/%E5%90%8E%E7%AB%AF/" itemprop="url" rel="index"><span itemprop="name">后端</span></a>
        </span>
          . 
        <span itemprop="about" itemscope itemtype="http://schema.org/Thing">
          <a href="/yuwanzi.io/categories/%E5%90%8E%E7%AB%AF/Java/" itemprop="url" rel="index"><span itemprop="name">Java</span></a>
        </span>
          . 
        <span itemprop="about" itemscope itemtype="http://schema.org/Thing">
          <a href="/yuwanzi.io/categories/%E5%90%8E%E7%AB%AF/Java/Spring/" itemprop="url" rel="index"><span itemprop="name">Spring</span></a>
        </span>
    </span>

  
</div>

        </div>
      </header>

    
    
    
    <div class="post-body" itemprop="articleBody">
          <h3 id="Spring与线程安全"><a href="#Spring与线程安全" class="headerlink" title="Spring与线程安全"></a>Spring与线程安全</h3><hr>
<p>Spring作为一个IOC/DI容器，帮助我们管理了许许多多的“bean”。但其实，<strong>Spring并没有保证这些对象的线程安全，需要由开发者自己编写解决线程安全问题的代码。</strong></p>
<p><strong>Spring对每个bean提供了一个scope属性来表示该bean的作用域</strong>。它是bean的生命周期。例如，一个scope为singleton的bean，在第一次被注入时，会创建为一个单例对象，该对象会一直被复用到应用结束。</p>
<ul>
<li><p>singleton：默认的scope，每个scope为singleton的bean都会被定义为一个单例对象，该对象的生命周期是与Spring IOC容器一致的（但在第一次被注入时才会创建）。</p>
</li>
<li><p>prototype：bean被定义为在每次注入时都会创建一个新的对象。</p>
</li>
<li><p>request：bean被定义为在每个HTTP请求中创建一个单例对象，也就是说在单个请求中都会复用这一个单例对象。</p>
</li>
<li><p>session：bean被定义为在一个session的生命周期内创建一个单例对象。</p>
</li>
<li><p>application：bean被定义为在ServletContext的生命周期中复用一个单例对象。</p>
</li>
<li><p>websocket：bean被定义为在websocket的生命周期中复用一个单例对象。</p>
</li>
</ul>
<p>我们交由Spring管理的大多数对象其实都是一些无状态的对象，这种不会因为多线程而导致状态被破坏的对象很适合Spring的默认scope，<strong>每个单例的无状态对象都是线程安全的（也可以说只要是无状态的对象，不管单例多例都是线程安全的，不过单例毕竟节省了不断创建对象与GC的开销）。</strong></p>
<p><strong>无状态的对象即是自身没有状态的对象，自然也就不会因为多个线程的交替调度而破坏自身状态导致线程安全问题</strong>。无状态对象包括我们经常使用的DO、DTO、VO这些只作为数据的实体模型的贫血对象，还有Service、DAO和Controller，这些对象并没有自己的状态，它们只是用来执行某些操作的。例如，每个DAO提供的函数都只是对数据库的CRUD，而且每个数据库Connection都作为函数的局部变量（局部变量是在用户栈中的，而且用户栈本身就是线程私有的内存区域，所以不存在线程安全问题），用完即关（或交还给连接池）。</p>
<p>有人可能会认为，我使用request作用域不就可以避免每个请求之间的安全问题了吗？这是完全错误的，因为Controller默认是单例的，同一个Controller实例会被处理不同HTTP请求的多个线程并发调用，这就又回到了线程的安全问题。当然，你也可以把Controller的scope改成prototype，实际上Struts2就是这么做的，但有一点要注意，Spring MVC对请求的拦截粒度是基于每个方法的，而Struts2是基于每个类的，所以把Controller设为多例将会频繁地创建与回收对象，严重影响性能。</p>
<p>上文其实已经说得很清楚了，<strong>Spring根本就没有对bean的多线程安全问题做出任何保证与措施</strong>。对于每个bean的线程安全问题，根本原因是每个bean自身的设计。<strong>不要在bean中声明任何有状态的实例变量或类变量，如果必须如此，那么就使用ThreadLocal把变量变为线程私有的，如果bean的实例变量或类变量需要在多个线程之间共享，那么就只能使用synchronized、lock、CAS等这些实现线程同步的方法了。</strong></p>
<p>下面将通过解析ThreadLocal的源码来了解它的实现与作用，ThreadLocal是一个很好用的工具类，它在某些情况下解决了线程安全问题（在变量不需要被多个线程共享时）。</p>
<blockquote>
<p>本文作者为<a target="_blank" rel="noopener" href="https://github.com/SylvanasSun">SylvanasSun(sylvanas.sun@gmail.com)</a>，首发于<a target="_blank" rel="noopener" href="https://sylvanassun.github.io/">SylvanasSun’s Blog</a>。<br>原文链接：<a target="_blank" rel="noopener" href="https://sylvanassun.github.io/2017/11/06/2017-11-06-spring_and_thread-safe/">https://sylvanassun.github.io/2017/11/06/2017-11-06-spring_and_thread-safe/</a><br>（转载请务必保留本段声明，并且保留超链接。）</p>
</blockquote>
<h3 id="ThreadLocal"><a href="#ThreadLocal" class="headerlink" title="ThreadLocal"></a>ThreadLocal</h3><hr>
<p>ThreadLocal是一个为线程提供线程局部变量的工具类。它的思想也十分简单，就是<strong>为线程提供一个线程私有的变量副本</strong>，这样多个线程都可以随意更改自己线程局部的变量，不会影响到其他线程。不过需要注意的是，<strong>ThreadLocal提供的只是一个浅拷贝，如果变量是一个引用类型，那么就要考虑它内部的状态是否会被改变，想要解决这个问题可以通过重写ThreadLocal的initialValue()函数来自己实现深拷贝</strong>，建议在使用ThreadLocal时一开始就重写该函数。</p>
<p>ThreadLocal与像synchronized这样的锁机制是不同的。首先，它们的应用场景与实现思路就不一样，<strong>锁更强调的是如何同步多个线程去正确地共享一个变量，ThreadLocal则是为了解决同一个变量如何不被多个线程共享</strong>。从性能开销的角度上来讲，如果锁机制是用时间换空间的话，那么ThreadLocal就是用空间换时间。</p>
<p>ThreadLocal中含有一个叫做ThreadLocalMap的内部类，该类为一个采用线性探测法实现的HashMap。它的key为ThreadLocal对象而且还使用了WeakReference，ThreadLocalMap正是用来存储变量副本的。</p>
<figure class="highlight java"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br></pre></td><td class="code"><pre><span class="line">   <span class="comment">/**</span></span><br><span class="line"><span class="comment">    * ThreadLocalMap is a customized hash map suitable only for</span></span><br><span class="line"><span class="comment">    * maintaining thread local values. No operations are exported</span></span><br><span class="line"><span class="comment">    * outside of the ThreadLocal class. The class is package private to</span></span><br><span class="line"><span class="comment">    * allow declaration of fields in class Thread.  To help deal with</span></span><br><span class="line"><span class="comment">    * very large and long-lived usages, the hash table entries use</span></span><br><span class="line"><span class="comment">    * WeakReferences for keys. 
However, since reference queues are not</span></span><br><span class="line"><span class="comment">    * used, stale entries are guaranteed to be removed only when</span></span><br><span class="line"><span class="comment">    * the table starts running out of space.</span></span><br><span class="line"><span class="comment">    */</span></span><br><span class="line">   <span class="keyword">static</span> <span class="class"><span class="keyword">class</span> <span class="title">ThreadLocalMap</span> </span>&#123;</span><br><span class="line">       <span class="comment">/**</span></span><br><span class="line"><span class="comment">        * The entries in this hash map extend WeakReference, using</span></span><br><span class="line"><span class="comment">        * its main ref field as the key (which is always a</span></span><br><span class="line"><span class="comment">        * ThreadLocal object).  Note that null keys (i.e. entry.get()</span></span><br><span class="line"><span class="comment">        * == null) mean that the key is no longer referenced, so the</span></span><br><span class="line"><span class="comment">        * entry can be expunged from table.  Such entries are referred to</span></span><br><span class="line"><span class="comment">        * as &quot;stale entries&quot; in the code that follows.</span></span><br><span class="line"><span class="comment">        */</span></span><br><span class="line">       <span class="keyword">static</span> <span class="class"><span class="keyword">class</span> <span class="title">Entry</span> <span class="keyword">extends</span> <span class="title">WeakReference</span>&lt;<span class="title">ThreadLocal</span>&lt;?&gt;&gt; </span>&#123;</span><br><span class="line">           <span class="comment">/** The value associated with this ThreadLocal. 
*/</span></span><br><span class="line">           Object value;</span><br><span class="line"></span><br><span class="line">           Entry(ThreadLocal&lt;?&gt; k, Object v) &#123;</span><br><span class="line">               <span class="keyword">super</span>(k);</span><br><span class="line">               value = v;</span><br><span class="line">           &#125;</span><br><span class="line">       &#125;</span><br><span class="line">	....</span><br><span class="line">&#125;	</span><br></pre></td></tr></table></figure>
<p>ThreadLocal中只含有三个成员变量，这三个变量都是与ThreadLocalMap的hash策略相关的。</p>
<figure class="highlight java"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br></pre></td><td class="code"><pre><span class="line"><span class="comment">/**</span></span><br><span class="line"><span class="comment"> * ThreadLocals rely on per-thread linear-probe hash maps attached</span></span><br><span class="line"><span class="comment"> * to each thread (Thread.threadLocals and</span></span><br><span class="line"><span class="comment"> * inheritableThreadLocals).  The ThreadLocal objects act as keys,</span></span><br><span class="line"><span class="comment"> * searched via threadLocalHashCode.  
This is a custom hash code</span></span><br><span class="line"><span class="comment"> * (useful only within ThreadLocalMaps) that eliminates collisions</span></span><br><span class="line"><span class="comment"> * in the common case where consecutively constructed ThreadLocals</span></span><br><span class="line"><span class="comment"> * are used by the same threads, while remaining well-behaved in</span></span><br><span class="line"><span class="comment"> * less common cases.</span></span><br><span class="line"><span class="comment"> */</span></span><br><span class="line"><span class="keyword">private</span> <span class="keyword">final</span> <span class="keyword">int</span> threadLocalHashCode = nextHashCode();</span><br><span class="line"></span><br><span class="line"><span class="comment">/**</span></span><br><span class="line"><span class="comment"> * The next hash code to be given out. Updated atomically. Starts at</span></span><br><span class="line"><span class="comment"> * zero.</span></span><br><span class="line"><span class="comment"> */</span></span><br><span class="line"><span class="keyword">private</span> <span class="keyword">static</span> AtomicInteger nextHashCode =</span><br><span class="line">    <span class="keyword">new</span> AtomicInteger();</span><br><span class="line"></span><br><span class="line"><span class="comment">/**</span></span><br><span class="line"><span class="comment"> * The difference between successively generated hash codes - turns</span></span><br><span class="line"><span class="comment"> * implicit sequential thread-local IDs into near-optimally spread</span></span><br><span class="line"><span class="comment"> * multiplicative hash values for power-of-two-sized tables.</span></span><br><span class="line"><span class="comment"> */</span></span><br><span class="line"><span class="keyword">private</span> <span class="keyword">static</span> <span class="keyword">final</span> <span class="keyword">int</span> HASH_INCREMENT = <span 
class="number">0x61c88647</span>;</span><br><span class="line"></span><br><span class="line"><span class="comment">/**</span></span><br><span class="line"><span class="comment"> * Returns the next hash code.</span></span><br><span class="line"><span class="comment"> */</span></span><br><span class="line"><span class="function"><span class="keyword">private</span> <span class="keyword">static</span> <span class="keyword">int</span> <span class="title">nextHashCode</span><span class="params">()</span> </span>&#123;</span><br><span class="line">    <span class="keyword">return</span> nextHashCode.getAndAdd(HASH_INCREMENT);</span><br><span class="line">&#125;</span><br></pre></td></tr></table></figure>
<p>唯一的实例变量threadLocalHashCode是用来进行寻址的hashcode，它由函数nextHashCode()生成，该函数简单地通过一个增量HASH_INCREMENT来生成hashcode。至于为什么这个增量为0x61c88647，主要是因为ThreadLocalMap的初始大小为16，每次扩容都会为原来的2倍，这样它的容量永远为2的n次方，该增量选为0x61c88647也是为了尽可能均匀地分布，减少碰撞冲突。</p>
<figure class="highlight java"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br></pre></td><td class="code"><pre><span class="line"><span class="comment">/**</span></span><br><span class="line"><span class="comment"> * The initial capacity -- MUST be a power of two.</span></span><br><span class="line"><span class="comment"> */</span></span><br><span class="line"><span class="keyword">private</span> <span class="keyword">static</span> <span class="keyword">final</span> <span class="keyword">int</span> INITIAL_CAPACITY = <span class="number">16</span>;	</span><br><span class="line"></span><br><span class="line"><span class="comment">/**</span></span><br><span class="line"><span class="comment"> * Construct a new map initially containing (firstKey, firstValue).</span></span><br><span class="line"><span class="comment"> * ThreadLocalMaps are constructed lazily, so we only create</span></span><br><span class="line"><span class="comment"> * one when we have at least one entry to put in it.</span></span><br><span class="line"><span class="comment"> */</span></span><br><span class="line">ThreadLocalMap(ThreadLocal&lt;?&gt; firstKey, Object firstValue) &#123;</span><br><span class="line">    table = <span class="keyword">new</span> Entry[INITIAL_CAPACITY];</span><br><span class="line">    <span class="keyword">int</span> i = firstKey.threadLocalHashCode &amp; (INITIAL_CAPACITY - <span class="number">1</span>);</span><br><span class="line">    table[i] = <span 
class="keyword">new</span> Entry(firstKey, firstValue);</span><br><span class="line">    size = <span class="number">1</span>;</span><br><span class="line">    setThreshold(INITIAL_CAPACITY);</span><br><span class="line">&#125;</span><br></pre></td></tr></table></figure>
<p>要获得当前线程私有的变量副本需要调用get()函数。首先，它会调用getMap()函数去获得当前线程的ThreadLocalMap，这个函数需要接收当前线程的实例作为参数。如果得到的ThreadLocalMap为null（或者map中找不到当前ThreadLocal对应的Entry），那么就去调用setInitialValue()函数来进行初始化；否则，就通过map来获得变量副本并返回。</p>
<p>setInitialValue()函数会先调用initialValue()函数来生成初始值，该函数默认返回null，我们可以通过重写这个函数来返回我们想要在ThreadLocal中维护的变量。之后，去调用getMap()函数获得ThreadLocalMap，如果该map已经存在，那么就用新获得的value去覆盖旧值，否则就调用createMap()函数来创建新的map。</p>
<figure class="highlight java"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br></pre></td><td class="code"><pre><span class="line"><span class="comment">/**</span></span><br><span class="line"><span class="comment"> * Returns the value in the current thread&#x27;s copy of this</span></span><br><span class="line"><span class="comment"> * thread-local variable.  
If the variable has no value for the</span></span><br><span class="line"><span class="comment"> * current thread, it is first initialized to the value returned</span></span><br><span class="line"><span class="comment"> * by an invocation of the &#123;<span class="doctag">@link</span> #initialValue&#125; method.</span></span><br><span class="line"><span class="comment"> *</span></span><br><span class="line"><span class="comment"> * <span class="doctag">@return</span> the current thread&#x27;s value of this thread-local</span></span><br><span class="line"><span class="comment"> */</span></span><br><span class="line"><span class="function"><span class="keyword">public</span> T <span class="title">get</span><span class="params">()</span> </span>&#123;</span><br><span class="line">    Thread t = Thread.currentThread();</span><br><span class="line">    ThreadLocalMap map = getMap(t);</span><br><span class="line">    <span class="keyword">if</span> (map != <span class="keyword">null</span>) &#123;</span><br><span class="line">        ThreadLocalMap.Entry e = map.getEntry(<span class="keyword">this</span>);</span><br><span class="line">        <span class="keyword">if</span> (e != <span class="keyword">null</span>) &#123;</span><br><span class="line">            <span class="meta">@SuppressWarnings(&quot;unchecked&quot;)</span></span><br><span class="line">            T result = (T)e.value;</span><br><span class="line">            <span class="keyword">return</span> result;</span><br><span class="line">        &#125;</span><br><span class="line">    &#125;</span><br><span class="line">    <span class="keyword">return</span> setInitialValue();</span><br><span class="line">&#125;</span><br><span class="line">	</span><br><span class="line"><span class="comment">/**</span></span><br><span class="line"><span class="comment"> * Variant of set() to establish initialValue. 
Used instead</span></span><br><span class="line"><span class="comment"> * of set() in case user has overridden the set() method.</span></span><br><span class="line"><span class="comment"> *</span></span><br><span class="line"><span class="comment"> * <span class="doctag">@return</span> the initial value</span></span><br><span class="line"><span class="comment"> */</span></span><br><span class="line"><span class="function"><span class="keyword">private</span> T <span class="title">setInitialValue</span><span class="params">()</span> </span>&#123;</span><br><span class="line">    T value = initialValue();</span><br><span class="line">    Thread t = Thread.currentThread();</span><br><span class="line">    ThreadLocalMap map = getMap(t);</span><br><span class="line">    <span class="keyword">if</span> (map != <span class="keyword">null</span>)</span><br><span class="line">        map.set(<span class="keyword">this</span>, value);</span><br><span class="line">    <span class="keyword">else</span></span><br><span class="line">        createMap(t, value);</span><br><span class="line">    <span class="keyword">return</span> value;</span><br><span class="line">&#125;</span><br><span class="line">	</span><br><span class="line"><span class="function"><span class="keyword">protected</span> T <span class="title">initialValue</span><span class="params">()</span> </span>&#123;</span><br><span class="line">    <span class="keyword">return</span> <span class="keyword">null</span>;</span><br><span class="line">&#125;	</span><br></pre></td></tr></table></figure>
<p>ThreadLocal的set()与remove()函数要比get()的实现还要简单，都只是通过getMap()来获得ThreadLocalMap然后对其进行操作。</p>
<figure class="highlight java"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br></pre></td><td class="code"><pre><span class="line"><span class="comment">/**</span></span><br><span class="line"><span class="comment"> * Sets the current thread&#x27;s copy of this thread-local variable</span></span><br><span class="line"><span class="comment"> * to the specified value.  
Most subclasses will have no need to</span></span><br><span class="line"><span class="comment"> * override this method, relying solely on the &#123;<span class="doctag">@link</span> #initialValue&#125;</span></span><br><span class="line"><span class="comment"> * method to set the values of thread-locals.</span></span><br><span class="line"><span class="comment"> *</span></span><br><span class="line"><span class="comment"> * <span class="doctag">@param</span> value the value to be stored in the current thread&#x27;s copy of</span></span><br><span class="line"><span class="comment"> *        this thread-local.</span></span><br><span class="line"><span class="comment"> */</span></span><br><span class="line"><span class="function"><span class="keyword">public</span> <span class="keyword">void</span> <span class="title">set</span><span class="params">(T value)</span> </span>&#123;</span><br><span class="line">    Thread t = Thread.currentThread();</span><br><span class="line">    ThreadLocalMap map = getMap(t);</span><br><span class="line">    <span class="keyword">if</span> (map != <span class="keyword">null</span>)</span><br><span class="line">        map.set(<span class="keyword">this</span>, value);</span><br><span class="line">    <span class="keyword">else</span></span><br><span class="line">        createMap(t, value);</span><br><span class="line">&#125;</span><br><span class="line"></span><br><span class="line"><span class="comment">/**</span></span><br><span class="line"><span class="comment"> * Removes the current thread&#x27;s value for this thread-local</span></span><br><span class="line"><span class="comment"> * variable.  
If this thread-local variable is subsequently</span></span><br><span class="line"><span class="comment"> * &#123;<span class="doctag">@linkplain</span> #get read&#125; by the current thread, its value will be</span></span><br><span class="line"><span class="comment"> * reinitialized by invoking its &#123;<span class="doctag">@link</span> #initialValue&#125; method,</span></span><br><span class="line"><span class="comment"> * unless its value is &#123;<span class="doctag">@linkplain</span> #set set&#125; by the current thread</span></span><br><span class="line"><span class="comment"> * in the interim.  This may result in multiple invocations of the</span></span><br><span class="line"><span class="comment"> * &#123;<span class="doctag">@code</span> initialValue&#125; method in the current thread.</span></span><br><span class="line"><span class="comment"> *</span></span><br><span class="line"><span class="comment"> * <span class="doctag">@since</span> 1.5</span></span><br><span class="line"><span class="comment"> */</span></span><br><span class="line"> <span class="function"><span class="keyword">public</span> <span class="keyword">void</span> <span class="title">remove</span><span class="params">()</span> </span>&#123;</span><br><span class="line">     ThreadLocalMap m = getMap(Thread.currentThread());</span><br><span class="line">     <span class="keyword">if</span> (m != <span class="keyword">null</span>)</span><br><span class="line">         m.remove(<span class="keyword">this</span>);</span><br><span class="line"> &#125;</span><br></pre></td></tr></table></figure>
<p>getMap()函数与createMap()函数的实现也十分简单，但是通过观察这两个函数可以发现一个秘密：<strong>ThreadLocalMap是存放在Thread中的。</strong></p>
<figure class="highlight java"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br></pre></td><td class="code"><pre><span class="line">   <span class="comment">/**</span></span><br><span class="line"><span class="comment">    * Get the map associated with a ThreadLocal. 
Overridden in</span></span><br><span class="line"><span class="comment">    * InheritableThreadLocal.</span></span><br><span class="line"><span class="comment">    *</span></span><br><span class="line"><span class="comment">    * <span class="doctag">@param</span>  t the current thread</span></span><br><span class="line"><span class="comment">    * <span class="doctag">@return</span> the map</span></span><br><span class="line"><span class="comment">    */</span></span><br><span class="line">   <span class="function">ThreadLocalMap <span class="title">getMap</span><span class="params">(Thread t)</span> </span>&#123;</span><br><span class="line">       <span class="keyword">return</span> t.threadLocals;</span><br><span class="line">   &#125;</span><br><span class="line"></span><br><span class="line">   <span class="comment">/**</span></span><br><span class="line"><span class="comment">    * Create the map associated with a ThreadLocal. Overridden in</span></span><br><span class="line"><span class="comment">    * InheritableThreadLocal.</span></span><br><span class="line"><span class="comment">    *</span></span><br><span class="line"><span class="comment">    * <span class="doctag">@param</span> t the current thread</span></span><br><span class="line"><span class="comment">    * <span class="doctag">@param</span> firstValue value for the initial entry of the map</span></span><br><span class="line"><span class="comment">    */</span></span><br><span class="line">   <span class="function"><span class="keyword">void</span> <span class="title">createMap</span><span class="params">(Thread t, T firstValue)</span> </span>&#123;</span><br><span class="line">       t.threadLocals = <span class="keyword">new</span> ThreadLocalMap(<span class="keyword">this</span>, firstValue);</span><br><span class="line">   &#125;</span><br><span class="line"></span><br><span class="line"><span class="comment">// Thread中的源码</span></span><br><span class="line"></span><br><span class="line">   
<span class="comment">/* ThreadLocal values pertaining to this thread. This map is maintained</span></span><br><span class="line"><span class="comment">    * by the ThreadLocal class. */</span></span><br><span class="line">   ThreadLocal.ThreadLocalMap threadLocals = <span class="keyword">null</span>;</span><br><span class="line"></span><br><span class="line">   <span class="comment">/*</span></span><br><span class="line"><span class="comment">    * InheritableThreadLocal values pertaining to this thread. This map is</span></span><br><span class="line"><span class="comment">    * maintained by the InheritableThreadLocal class.</span></span><br><span class="line"><span class="comment">    */</span></span><br><span class="line">   ThreadLocal.ThreadLocalMap inheritableThreadLocals = <span class="keyword">null</span>;	</span><br></pre></td></tr></table></figure>
<p>仔细想想其实就能够理解这种设计的思想。有一种普遍的方法是通过一个全局的线程安全的Map来存储各个线程的变量副本，但是这种做法已经完全违背了ThreadLocal的本意，设计ThreadLocal的初衷就是为了避免多个线程去并发访问同一个对象，尽管它是线程安全的。而在每个Thread中存放与它关联的ThreadLocalMap是完全符合ThreadLocal的思想的，当想要对线程局部变量进行操作时，只需要把Thread作为key来获得Thread中的ThreadLocalMap即可。这种设计相比采用一个全局Map的方法会多占用很多内存空间，但也因此不需要额外的采取锁等线程同步方法而节省了时间上的消耗。</p>
<h3 id="ThreadLocal中的内存泄漏"><a href="#ThreadLocal中的内存泄漏" class="headerlink" title="ThreadLocal中的内存泄漏"></a>ThreadLocal中的内存泄漏</h3><hr>
<p>我们要考虑一种会发生内存泄漏的情况，如果ThreadLocal被设置为null后，而且没有任何强引用指向它，根据垃圾回收的可达性分析算法，ThreadLocal将会被回收。这样一来，ThreadLocalMap中就会含有key为null的Entry，而且ThreadLocalMap是在Thread中的，只要线程迟迟不结束，这些无法访问到的value会形成内存泄漏。为了解决这个问题，ThreadLocalMap中的getEntry()、set()和remove()函数都会清理key为null的Entry，以下面的getEntry()函数的源码为例。</p>
<figure class="highlight java"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br><span class="line">43</span><br><span class="line">44</span><br><span class="line">45</span><br></pre></td><td class="code"><pre><span class="line">     <span class="comment">/**</span></span><br><span class="line"><span class="comment">      * Get the entry associated with key.  This method</span></span><br><span class="line"><span class="comment">      * itself handles only the fast path: a direct hit of existing</span></span><br><span class="line"><span class="comment">      * key. It otherwise relays to getEntryAfterMiss.  
This is</span></span><br><span class="line"><span class="comment">      * designed to maximize performance for direct hits, in part</span></span><br><span class="line"><span class="comment">      * by making this method readily inlinable.</span></span><br><span class="line"><span class="comment">      *</span></span><br><span class="line"><span class="comment">      * <span class="doctag">@param</span>  key the thread local object</span></span><br><span class="line"><span class="comment">      * <span class="doctag">@return</span> the entry associated with key, or null if no such</span></span><br><span class="line"><span class="comment">      */</span></span><br><span class="line">     <span class="function"><span class="keyword">private</span> Entry <span class="title">getEntry</span><span class="params">(ThreadLocal&lt;?&gt; key)</span> </span>&#123;</span><br><span class="line">         <span class="keyword">int</span> i = key.threadLocalHashCode &amp; (table.length - <span class="number">1</span>);</span><br><span class="line">         Entry e = table[i];</span><br><span class="line">         <span class="keyword">if</span> (e != <span class="keyword">null</span> &amp;&amp; e.get() == key)</span><br><span class="line">             <span class="keyword">return</span> e;</span><br><span class="line">         <span class="keyword">else</span></span><br><span class="line">             <span class="keyword">return</span> getEntryAfterMiss(key, i, e);</span><br><span class="line">     &#125;</span><br><span class="line"></span><br><span class="line">     <span class="comment">/**</span></span><br><span class="line"><span class="comment">      * Version of getEntry method for use when key is not found in</span></span><br><span class="line"><span class="comment">      * its direct hash slot.</span></span><br><span class="line"><span class="comment">      *</span></span><br><span class="line"><span class="comment">      * <span class="doctag">@param</span>  key the 
thread local object</span></span><br><span class="line"><span class="comment">      * <span class="doctag">@param</span>  i the table index for key&#x27;s hash code</span></span><br><span class="line"><span class="comment">      * <span class="doctag">@param</span>  e the entry at table[i]</span></span><br><span class="line"><span class="comment">      * <span class="doctag">@return</span> the entry associated with key, or null if no such</span></span><br><span class="line"><span class="comment">      */</span></span><br><span class="line">     <span class="function"><span class="keyword">private</span> Entry <span class="title">getEntryAfterMiss</span><span class="params">(ThreadLocal&lt;?&gt; key, <span class="keyword">int</span> i, Entry e)</span> </span>&#123;</span><br><span class="line">         Entry[] tab = table;</span><br><span class="line">         <span class="keyword">int</span> len = tab.length;</span><br><span class="line"></span><br><span class="line"><span class="comment">// 清理key为null的Entry</span></span><br><span class="line">         <span class="keyword">while</span> (e != <span class="keyword">null</span>) &#123;</span><br><span class="line">             ThreadLocal&lt;?&gt; k = e.get();</span><br><span class="line">             <span class="keyword">if</span> (k == key)</span><br><span class="line">                 <span class="keyword">return</span> e;</span><br><span class="line">             <span class="keyword">if</span> (k == <span class="keyword">null</span>)</span><br><span class="line">                 expungeStaleEntry(i);</span><br><span class="line">             <span class="keyword">else</span></span><br><span class="line">                 i = nextIndex(i, len);</span><br><span class="line">             e = tab[i];</span><br><span class="line">         &#125;</span><br><span class="line">         <span class="keyword">return</span> <span class="keyword">null</span>;</span><br><span class="line">     
&#125;</span><br></pre></td></tr></table></figure>
<p>在上文中我们发现了ThreadLocalMap的key是一个弱引用，那么为什么使用弱引用呢？使用强引用key与弱引用key的差别如下：</p>
<ul>
<li><p>强引用key：ThreadLocal被设置为null，由于ThreadLocalMap持有ThreadLocal的强引用，如果不手动删除，那么ThreadLocal将不会回收，产生内存泄漏。</p>
</li>
<li><p>弱引用key：ThreadLocal被设置为null，由于ThreadLocalMap持有ThreadLocal的弱引用，即便不手动删除，ThreadLocal仍会被回收，ThreadLocalMap在之后调用set()、getEntry()和remove()函数时会清除所有key为null的Entry。</p>
</li>
</ul>
<p>但要注意的是，ThreadLocalMap仅仅含有这些被动措施来补救内存泄漏问题。如果你在之后没有调用ThreadLocalMap的set()、getEntry()和remove()函数的话，那么仍然会存在内存泄漏问题。</p>
<p>在使用线程池的情况下，如果不及时进行清理，内存泄漏问题事小，甚至还会产生程序逻辑上的问题。所以，<strong>为了安全地使用ThreadLocal，必须要像每次使用完锁就解锁一样，在每次使用完ThreadLocal后都要调用remove()来清理无用的Entry。</strong></p>
<h3 id="参考文献"><a href="#参考文献" class="headerlink" title="参考文献"></a>参考文献</h3><hr>
<ul>
<li><p><a target="_blank" rel="noopener" href="https://stackoverflow.com/questions/15745140/are-spring-objects-thread-safe">Are Spring objects thread safe? - Stack Overflow</a></p>
</li>
<li><p><a target="_blank" rel="noopener" href="https://tarunsapra.wordpress.com/2011/08/21/spring-singleton-request-session-beans-and-thread-safety/">Spring Singleton, Request, Session Beans and Thread Safety | Java Enterprise Ecosystem.</a></p>
</li>
<li><p><a target="_blank" rel="noopener" href="https://docs.spring.io/spring/docs/current/spring-framework-reference/index.html">Spring Framework Documentation</a></p>
</li>
</ul>

      
    </div>

    
    
    

    <footer class="post-footer">
        <div class="post-eof"></div>
      
    </footer>
  </article>
</div>




    


<div class="post-block">
  
  

  <article itemscope itemtype="http://schema.org/Article" class="post-content" lang="zh">
    <link itemprop="mainEntityOfPage" href="https://suyuhuan.gitee.io/yuwanzi.io/2017/10/29/2017-10-29-virtual_memory/">

    <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
      <meta itemprop="image" content="/yuwanzi.io/images/avatar.gif">
      <meta itemprop="name" content="玉丸子">
      <meta itemprop="description" content="这里是玉丸子的个人博客,与你一起发现更大的世界。">
    </span>

    <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
      <meta itemprop="name" content="玉丸子 | Blog">
    </span>
      <header class="post-header">
        <h2 class="post-title" itemprop="name headline">
          <a href="/yuwanzi.io/2017/10/29/2017-10-29-virtual_memory/" class="post-title-link" itemprop="url">虚拟内存的那点事儿</a>
        </h2>

        <div class="post-meta-container">
          <div class="post-meta">
    <span class="post-meta-item">
      <span class="post-meta-item-icon">
        <i class="far fa-calendar"></i>
      </span>
      <span class="post-meta-item-text">Veröffentlicht am</span>

      <time title="Erstellt: 2017-10-29 18:00:00" itemprop="dateCreated datePublished" datetime="2017-10-29T18:00:00+08:00">2017-10-29</time>
    </span>
      <span class="post-meta-item">
        <span class="post-meta-item-icon">
          <i class="far fa-calendar-check"></i>
        </span>
        <span class="post-meta-item-text">Bearbeitet am</span>
        <time title="Geändert am: 2020-11-07 08:58:17" itemprop="dateModified" datetime="2020-11-07T08:58:17+08:00">2020-11-07</time>
      </span>
    <span class="post-meta-item">
      <span class="post-meta-item-icon">
        <i class="far fa-folder"></i>
      </span>
      <span class="post-meta-item-text">in</span>
        <span itemprop="about" itemscope itemtype="http://schema.org/Thing">
          <a href="/yuwanzi.io/categories/%E8%AE%A1%E7%AE%97%E6%9C%BA/" itemprop="url" rel="index"><span itemprop="name">计算机</span></a>
        </span>
    </span>

  
</div>

        </div>
      </header>

    
    
    
    <div class="post-body" itemprop="articleBody">
          <h3 id="概述"><a href="#概述" class="headerlink" title="概述"></a>概述</h3><hr>
<p>我们都知道一个进程是与其他进程共享CPU和内存资源的。正因如此，操作系统需要有一套完善的内存管理机制才能防止进程之间相互干扰、破坏彼此内存的问题。</p>
<p>为了更加有效地管理内存并减少出错，现代操作系统提供了一种对主存的抽象概念，即是虚拟内存（Virtual Memory）。<strong>虚拟内存为每个进程提供了一个一致的、私有的地址空间，它让每个进程产生了一种自己在独享主存的错觉（每个进程拥有一片连续完整的内存空间）</strong>。</p>
<p>理解不深刻的人会认为虚拟内存只是“使用硬盘空间来扩展内存”的技术，这是不对的。<strong>虚拟内存的重要意义是它定义了一个连续的虚拟地址空间</strong>，使得程序的编写难度降低。并且，<strong>把内存扩展到硬盘空间只是使用虚拟内存的必然结果，虚拟内存空间会存在硬盘中，并且会被内存缓存（按需），有的操作系统还会在内存不够的情况下，将某一进程的内存全部放入硬盘空间中，并在切换到该进程时再从硬盘读取</strong>（这也是为什么Windows会经常假死的原因……）。</p>
<p>虚拟内存主要提供了如下三个重要的能力：</p>
<ul>
<li><p>它把主存看作为一个存储在硬盘上的虚拟地址空间的高速缓存，并且只在主存中缓存活动区域（按需缓存）。</p>
</li>
<li><p>它为每个进程提供了一个一致的地址空间，从而降低了程序员对内存管理的复杂性。</p>
</li>
<li><p>它还保护了每个进程的地址空间不会被其他进程破坏。</p>
</li>
</ul>
<p>介绍了虚拟内存的基本概念之后，接下来的内容将会从虚拟内存在硬件中如何运作逐渐过渡到虚拟内存在操作系统（Linux）中的实现。</p>
<blockquote>
<p>本文作者为<a target="_blank" rel="noopener" href="https://github.com/SylvanasSun">SylvanasSun(sylvanas.sun@gmail.com)</a>，首发于<a target="_blank" rel="noopener" href="https://sylvanassun.github.io/">SylvanasSun’s Blog</a>。<br>原文链接：<a target="_blank" rel="noopener" href="https://sylvanassun.github.io/2017/10/29/2017-10-29-virtual_memory/">https://sylvanassun.github.io/2017/10/29/2017-10-29-virtual_memory/</a><br>（转载请务必保留本段声明，并且保留超链接。）</p>
</blockquote>
<h3 id="CPU寻址"><a href="#CPU寻址" class="headerlink" title="CPU寻址"></a>CPU寻址</h3><hr>
<p>内存通常被组织为一个由M个连续的字节大小的单元组成的数组，每个字节都有一个唯一的物理地址（Physical Address PA），作为到数组的索引。CPU访问内存最简单直接的方法就是使用物理地址，这种寻址方式被称为物理寻址。</p>
<p>现代处理器使用的是一种称为虚拟寻址（Virtual  Addressing）的寻址方式。<strong>使用虚拟寻址，CPU需要将虚拟地址翻译成物理地址，这样才能访问到真实的物理内存。</strong></p>
<p><img src="https://upload.wikimedia.org/wikipedia/commons/d/dc/MMU_principle_updated.png" alt="虚拟寻址"></p>
<p>虚拟寻址需要硬件与操作系统之间互相合作。<strong>CPU中含有一个被称为内存管理单元（Memory Management Unit, MMU）的硬件，它的功能是将虚拟地址转换为物理地址。MMU需要借助存放在内存中的页表来动态翻译虚拟地址，该页表由操作系统管理。</strong></p>
<h3 id="页表"><a href="#页表" class="headerlink" title="页表"></a>页表</h3><hr>
<p>虚拟内存空间被组织为一个存放在硬盘上的M个连续的字节大小的单元组成的数组，每个字节都有一个唯一的虚拟地址，作为到数组的索引（这点其实与物理内存是一样的）。</p>
<p><strong>操作系统通过将虚拟内存分割为大小固定的块来作为硬盘和内存之间的传输单位，这个块被称为虚拟页（Virtual Page, VP），每个虚拟页的大小为<code>P=2^p</code>字节。物理内存也会按照这种方法分割为物理页（Physical Page, PP），大小也为<code>P</code>字节。</strong></p>
<p>CPU在获得虚拟地址之后，需要通过MMU将虚拟地址翻译为物理地址。而在翻译的过程中还需要借助页表，所谓<strong>页表就是一个存放在物理内存中的数据结构，它记录了虚拟页与物理页的映射关系。</strong></p>
<p><strong>页表是一个元素为页表条目（Page Table Entry, PTE）的集合，每个虚拟页在页表中一个固定偏移量的位置上都有一个PTE</strong>。下面是PTE仅含有一个有效位标记的页表结构，该有效位代表这个虚拟页是否被缓存在物理内存中。</p>
<p><img src="http://wx2.sinaimg.cn/large/63503acbly1fkxzl7qsdij20tv0ngjti.jpg" alt="仅含有效位的页表结构"></p>
<p>虚拟页<code>VP 0</code>、<code>VP 4</code>、<code>VP 6</code>、<code>VP 7</code>被缓存在物理内存中，虚拟页<code>VP 2</code>和<code>VP 5</code>被分配在页表中，但并没有缓存在物理内存，虚拟页<code>VP 1</code>和<code>VP 3</code>还没有被分配。</p>
<p>在进行动态内存分配时，例如<code>malloc()</code>函数或者其他高级语言中的<code>new</code>关键字，操作系统会在硬盘中创建或申请一段虚拟内存空间，并更新到页表（分配一个PTE，使该PTE指向硬盘上这个新创建的虚拟页）。</p>
<p><strong>由于CPU每次进行地址翻译的时候都需要经过PTE，所以如果想控制内存系统的访问，可以在PTE上添加一些额外的许可位（例如读写权限、内核权限等）</strong>，这样只要有指令违反了这些许可条件，CPU就会触发一个一般保护故障，将控制权传递给内核中的异常处理程序。一般这种异常被称为“段错误（Segmentation Fault）”。</p>
<h4 id="页命中"><a href="#页命中" class="headerlink" title="页命中"></a>页命中</h4><hr>
<p><img src="http://wx4.sinaimg.cn/large/63503acbly1fky3k2710cj20xv0ngtax.jpg" alt="页命中"></p>
<p>如上图所示，MMU根据虚拟地址在页表中寻址到了<code>PTE 4</code>，该PTE的有效位为1，代表该虚拟页已经被缓存在物理内存中了，最终MMU得到了PTE中的物理内存地址（指向<code>PP 1</code>）。</p>
<h4 id="缺页"><a href="#缺页" class="headerlink" title="缺页"></a>缺页</h4><hr>
<p><img src="http://wx4.sinaimg.cn/large/63503acbly1fky3k2krk0j20xv0ngmzd.jpg" alt="缺页"></p>
<p>如上图所示，MMU根据虚拟地址在页表中寻址到了<code>PTE 2</code>，该PTE的有效位为0，代表该虚拟页并没有被缓存在物理内存中。<strong>虚拟页没有被缓存在物理内存中（缓存未命中）被称为缺页。</strong></p>
<p><strong>当CPU遇见缺页时会触发一个缺页异常，缺页异常将控制权转向操作系统内核，然后调用内核中的缺页异常处理程序，该程序会选择一个牺牲页，如果牺牲页已被修改过，内核会先将它复制回硬盘（采用写回机制而不是直写也是为了尽量减少对硬盘的访问次数），然后再把该虚拟页覆盖到牺牲页的位置，并且更新PTE。</strong></p>
<p><strong>当缺页异常处理程序返回时，它会重新启动导致缺页的指令，该指令会把导致缺页的虚拟地址重新发送给MMU</strong>。由于现在已经成功处理了缺页异常，所以最终结果是页命中，并得到物理地址。</p>
<p>这种在硬盘和内存之间传送页的行为称为页面调度（paging）：页从硬盘换入内存和从内存换出到硬盘。当缺页异常发生时，才将页面换入到内存的策略称为按需页面调度（demand paging），所有现代操作系统基本都使用的是按需页面调度的策略。</p>
<p><strong>虚拟内存跟CPU高速缓存（或其他使用缓存的技术）一样依赖于局部性原则</strong>。虽然处理缺页消耗的性能很多（毕竟还是要从硬盘中读取），而且程序在运行过程中引用的不同虚拟页的总数可能会超出物理内存的大小，但是<strong>局部性原则保证了在任意时刻，程序将趋向于在一个较小的活动页面（active page）集合上工作，这个集合被称为工作集（working set）</strong>。根据空间局部性原则（一个被访问过的内存地址以及其周边的内存地址都会有很大几率被再次访问）与时间局部性原则（一个被访问过的内存地址在之后会有很大几率被再次访问），只要将工作集缓存在物理内存中，接下来的地址翻译请求很大几率都在其中，从而减少了额外的硬盘流量。</p>
<p>如果一个程序没有良好的局部性，将会使工作集的大小不断膨胀，直至超过物理内存的大小，这时程序会产生一种叫做抖动（thrashing）的状态，页面会不断地换入换出，如此多次的读写硬盘开销，性能自然会十分“恐怖”。<strong>所以，想要编写出性能高效的程序，首先要保证程序的时间局部性与空间局部性。</strong></p>
<h4 id="多级页表"><a href="#多级页表" class="headerlink" title="多级页表"></a>多级页表</h4><hr>
<p>我们目前为止讨论的只是单页表，但在实际的环境中虚拟空间地址都是很大的（一个32位系统的地址空间有<code>2^32 = 4GB</code>，更别说64位系统了）。在这种情况下，使用一个单页表明显是效率低下的。</p>
<p><strong>常用方法是使用层次结构的页表</strong>。假设我们的环境为一个32位的虚拟地址空间，它有如下形式：</p>
<ul>
<li><p>虚拟地址空间被分为4KB的页，每个PTE都是4字节。</p>
</li>
<li><p>内存的前2K个页面分配给了代码和数据。</p>
</li>
<li><p>之后的6K个页面还未被分配。</p>
</li>
<li><p>再接下来的1023个页面也未分配，其后的1个页面分配给了用户栈。</p>
</li>
</ul>
<p>下图是为该虚拟地址空间构造的二级页表层次结构（真实情况中多为四级或更多），一级页表（1024个PTE正好覆盖4GB的虚拟地址空间，同时每个PTE只有4字节，这样一级页表与二级页表的大小也正好与一个页面的大小一致都为4KB）的每个PTE负责映射虚拟地址空间中一个4MB的片（chunk），每一片都由1024个连续的页面组成。二级页表中的每个PTE负责映射一个4KB的虚拟内存页面。</p>
<p><img src="http://wx2.sinaimg.cn/large/63503acbly1fkyw24aje1j21060kmac9.jpg" alt="二级页表层次结构"></p>
<p>这个结构看起来很像是一个<code>B-Tree</code>，这种层次结构有效地降低了对内存的需求：</p>
<ul>
<li><p>如果一个一级页表的一个PTE是空的，那么相应的二级页表也不会存在。这代表一种巨大的潜在节约（对于一个普通的程序来说，虚拟地址空间的大部分都会是未分配的）。</p>
</li>
<li><p>只有一级页表才总是需要缓存在内存中的，这样虚拟内存系统就可以在需要时创建、页面调入或调出二级页表（只有经常使用的二级页表才会被缓存在内存中），这就减少了内存的压力。</p>
</li>
</ul>
<h3 id="地址翻译的过程"><a href="#地址翻译的过程" class="headerlink" title="地址翻译的过程"></a>地址翻译的过程</h3><hr>
<p>从形式上来说，<strong>地址翻译是一个N元素的虚拟地址空间中的元素和一个M元素的物理地址空间中元素之间的映射。</strong></p>
<p>下图为MMU利用页表进行寻址的过程：</p>
<p><img src="http://wx4.sinaimg.cn/large/63503acbly1fkz5z2vvi4j20up0fwt9n.jpg" alt="MMU利用页表进行寻址的过程"></p>
<p>页表基址寄存器（PTBR）指向当前页表。<strong>一个n位的虚拟地址包含两个部分，一个p位的虚拟页面偏移量（Virtual Page Offset, VPO）和一个（n - p）位的虚拟页号（Virtual Page Number, VPN）。</strong></p>
<p><strong>MMU根据VPN来选择对应的PTE</strong>，例如<code>VPN 0</code>代表<code>PTE 0</code>、<code>VPN 1</code>代表<code>PTE 1</code>，以此类推。因为物理页与虚拟页的大小是一致的，所以物理页面偏移量（Physical Page Offset, PPO）与VPO是相同的。那么之后<strong>只要将PTE中的物理页号（Physical Page Number, PPN）与虚拟地址中的VPO串联起来，就能得到相应的物理地址</strong>。</p>
<p>多级页表的地址翻译也是如此，只不过因为有多个层次，所以VPN需要分成多段。<strong>假设有一个k级页表，虚拟地址会被分割成k个VPN和1个VPO，每个<code>VPN i </code>都是一个到第i级页表的索引</strong>。为了构造物理地址，MMU需要访问k个PTE才能拿到对应的PPN。</p>
<p><img src="http://wx2.sinaimg.cn/large/63503acbly1fkz5z434u5j20ng0f0weo.jpg"></p>
<h4 id="TLB"><a href="#TLB" class="headerlink" title="TLB"></a>TLB</h4><hr>
<p>页表是被缓存在内存中的，尽管内存的速度相对于硬盘来说已经非常快了，但与CPU还是有所差距。<strong>为了防止每次地址翻译操作都需要去访问内存，CPU使用了高速缓存与TLB来缓存PTE。</strong></p>
<p>在最糟糕的情况下（不包括缺页），MMU需要访问内存取得相应的PTE，这个代价大约为几十到几百个周期，如果PTE凑巧缓存在L1高速缓存中（如果L1没有还会从L2中查找，不过我们忽略多级缓存的细节），那么性能开销就会下降到1个或2个周期。然而，许多系统甚至连这样微小的开销也需要消除，TLB由此而生。</p>
<p><img src="http://wx2.sinaimg.cn/large/63503acbly1fkz5z4klv8j20g106j3yf.jpg"></p>
<p>TLB（Translation Lookaside Buffer, TLB）被称为翻译后备缓冲器或翻译旁路缓冲器，它是<strong>MMU中的一个缓冲区，其中每一行都保存着一个由单个PTE组成的块。用于组选择和行匹配的索引与标记字段是从VPN中提取出来的，如果TLB中有<code>T = 2^t</code>个组，那么TLB索引（TLBI）是由VPN的t个最低位组成的，而TLB标记（TLBT）是由VPN中剩余的位组成的。</strong></p>
<p>下图为地址翻译的流程（TLB命中的情况下）：</p>
<p><img src="http://wx3.sinaimg.cn/large/63503acbly1fkz5z55afaj20vb0gxwf3.jpg" alt="TLB命中时的地址翻译流程"></p>
<ul>
<li><p>第一步，CPU将一个虚拟地址交给MMU进行地址翻译。</p>
</li>
<li><p>第二步和第三步，MMU通过TLB取得相应的PTE。</p>
</li>
<li><p>第四步，MMU通过PTE翻译出物理地址并将它发送给高速缓存/内存。</p>
</li>
<li><p>第五步，高速缓存返回数据到CPU（如果缓存命中的话，否则还需要访问内存）。</p>
</li>
</ul>
<p><strong>当TLB未命中时，MMU必须从高速缓存/内存中取出相应的PTE，并将新取得的PTE存放到TLB（如果TLB已满会覆盖一个已经存在的PTE）。</strong></p>
<p><img src="http://wx2.sinaimg.cn/large/63503acbly1fkz5z5m1yij20vb0gxjs7.jpg" alt="TLB未命中时的地址翻译流程"></p>
<h3 id="Linux中的虚拟内存系统"><a href="#Linux中的虚拟内存系统" class="headerlink" title="Linux中的虚拟内存系统"></a>Linux中的虚拟内存系统</h3><hr>
<p><strong>Linux为每个进程维护了一个单独的虚拟地址空间</strong>。虚拟地址空间分为内核空间与用户空间，用户空间包括代码、数据、堆、共享库以及栈，内核空间包括内核中的代码和数据结构，内核空间的某些区域被映射到所有进程共享的物理页面。<strong>Linux也将一组连续的虚拟页面（大小等于内存总量）映射到相应的一组连续的物理页面，这种做法为内核提供了一种便利的方法来访问物理内存中任何特定的位置。</strong></p>
<p><img src="http://wx4.sinaimg.cn/large/63503acbly1fl00rjmbbuj20k90jyjt7.jpg"></p>
<p><strong>Linux将虚拟内存组织成一些区域（也称为段）的集合，区域的概念允许虚拟地址空间有间隙。一个区域就是已经存在着的已分配的虚拟内存的连续片（chunk）</strong>。例如，代码段、数据段、堆、共享库段，以及用户栈都属于不同的区域，<strong>每个存在的虚拟页都保存在某个区域中，而不属于任何区域的虚拟页是不存在的，也不能被进程所引用。</strong></p>
<p>内核为系统中的每个进程维护一个单独的任务结构（task_struct）。<strong>任务结构中的元素包含或者指向内核运行该进程所需的所有信息（PID、指向用户栈的指针、可执行目标文件的名字、程序计数器等）。</strong></p>
<p><img src="http://wx1.sinaimg.cn/large/63503acbly1fl00rk2akcj20wl0k2wfo.jpg"></p>
<ul>
<li><p>mm_struct：描述了虚拟内存的当前状态。pgd指向一级页表的基址（当内核运行这个进程时，pgd会被存放在CR3控制寄存器，也就是页表基址寄存器中），mmap指向一个vm_area_struct的链表，其中每个vm_area_struct都描述了当前虚拟地址空间的一个区域。</p>
</li>
<li><p>vm_start：指向这个区域的起始处。</p>
</li>
<li><p>vm_end：指向这个区域的结束处。</p>
</li>
<li><p>vm_prot：描述这个区域内包含的所有页的读写许可权限。</p>
</li>
<li><p>vm_flags：描述这个区域内的页面是与其他进程共享的，还是这个进程私有的以及一些其他信息。</p>
</li>
<li><p>vm_next：指向链表的下一个区域结构。</p>
</li>
</ul>
<h3 id="内存映射"><a href="#内存映射" class="headerlink" title="内存映射"></a>内存映射</h3><hr>
<p><strong>Linux通过将一个虚拟内存区域与一个硬盘上的文件关联起来，以初始化这个虚拟内存区域的内容，这个过程称为内存映射（memory mapping）。这种将虚拟内存系统集成到文件系统的方法可以简单而高效地把程序和数据加载到内存中。</strong></p>
<p>一个区域可以映射到一个普通硬盘文件的连续部分，例如一个可执行目标文件。文件区（section）被分成页大小的片，每一片包含一个虚拟页的初始内容。<strong>由于按需页面调度的策略，这些虚拟页面没有实际交换进入物理内存，直到CPU引用的虚拟地址在该区域的范围内</strong>。如果区域比文件区要大，那么就用零来填充这个区域的余下部分。</p>
<p><strong>一个区域也可以映射到一个匿名文件，匿名文件是由内核创建的，包含的全是二进制零</strong>。当CPU第一次引用这样一个区域内的虚拟页面时，内核就在物理内存中找到一个合适的牺牲页面，如果该页面被修改过，就先将它写回到硬盘，之后用二进制零覆盖牺牲页并更新页表，将这个页面标记为已缓存在内存中的。</p>
<p>简单的来说：<strong>普通文件映射就是将一个文件与一块内存建立起映射关系，对该文件进行IO操作可以绕过内核直接在用户态完成（用户态在该虚拟地址区域读写就相当于读写这个文件）。匿名文件映射一般在用户空间需要分配一段内存来存放数据时，由内核创建匿名文件并与内存进行映射，之后用户态就可以通过操作这段虚拟地址来操作内存了。匿名文件映射最熟悉的应用场景就是动态内存分配（malloc()函数）。</strong></p>
<p>Linux很多地方都采用了“懒加载”机制，自然也包括内存映射。不管是普通文件映射还是匿名映射，Linux只会先划分虚拟内存地址。只有当CPU第一次访问该区域内的虚拟地址时，才会真正的与物理内存建立映射关系。</p>
<p><strong>只要虚拟页被初始化了，它就会在物理内存与一个由内核维护的交换文件（swap file）之间换来换去。交换文件又称为交换空间（swap space）或交换区域（swap area）。swap区域不止用于页交换，在物理内存不够的情况下，还会将部分内存数据交换到swap区域（使用硬盘来扩展内存）。</strong></p>
<h4 id="共享对象"><a href="#共享对象" class="headerlink" title="共享对象"></a>共享对象</h4><hr>
<p>虚拟内存系统为每个进程提供了私有的虚拟地址空间，这样可以保证进程之间不会发生错误的读写。但多个进程之间也含有相同的部分，例如每个C程序都使用到了C标准库，如果每个进程都在物理内存中保持这些代码的副本，那会造成很大的内存资源浪费。</p>
<p><strong>内存映射提供了共享对象的机制，来避免内存资源的浪费。一个对象被映射到虚拟内存的一个区域，要么是作为共享对象，要么是作为私有对象的。</strong></p>
<p>如果一个进程将一个共享对象映射到它的虚拟地址空间的一个区域内，那么这个进程对这个区域的任何写操作，对于那些也把这个共享对象映射到它们虚拟内存的其他进程而言，也是可见的。相对的，对一个映射到私有对象的区域的任何写操作，对于其他进程来说是不可见的。一个映射到共享对象的虚拟内存区域叫做共享区域，类似地，也有私有区域。</p>
<p><strong>为了节约内存，私有对象开始的生命周期与共享对象基本上是一致的（在物理内存中只保存私有对象的一份副本），并使用写时复制的技术来应对多个进程的写冲突。</strong></p>
<p><img src="http://wx4.sinaimg.cn/large/63503acbly1fl0dggg7d9j218e0kgtab.jpg"></p>
<p>只要没有进程试图写它自己的私有区域，那么多个进程就可以继续共享物理内存中私有对象的一个单独副本。然而，只要有一个进程试图对私有区域的某一页面进行写操作，就会触发一个保护异常。在上图中，进程B试图对私有区域的一个页面进行写操作，该操作触发了保护异常。<strong>异常处理程序会在物理内存中创建这个页面的一个新副本，并更新PTE指向这个新的副本，然后恢复这个页的可写权限。</strong></p>
<p>还有一个典型的例子就是<code>fork()</code>函数，该函数用于创建子进程。当<code>fork()</code>函数被当前进程调用时，内核会为新进程创建各种必要的数据结构，并分配给它一个唯一的PID。为了给新进程创建虚拟内存，它复制了当前进程的<code>mm_struct</code>、<code>vm_area_struct</code>和页表的原样副本。并将两个进程的每个页面都标为只读，两个进程中的每个区域都标记为私有区域（写时复制）。</p>
<p>这样，父进程和子进程的虚拟内存空间完全一致，只有当这两个进程中的任一个进行写操作时，再使用写时复制来保证每个进程的虚拟地址空间私有的抽象概念。</p>
<h3 id="动态内存分配"><a href="#动态内存分配" class="headerlink" title="动态内存分配"></a>动态内存分配</h3><hr>
<p>虽然可以使用内存映射（<code>mmap()</code>函数）来创建和删除虚拟内存区域来满足运行时动态内存分配的问题。然而，为了更好的移植性与便利性，还需要一个更高层面的抽象，也就是动态内存分配器（dynamic memory allocator）。</p>
<p><strong>动态内存分配器维护着一个进程的虚拟内存区域，也就是我们所熟悉的“堆（heap）”，内核中还维护着一个指向堆顶的指针brk（break）。动态内存分配器将堆视为一个连续的虚拟内存块（chunk）的集合，每个块有两种状态，已分配和空闲。已分配的块显式地保留为供应用程序使用，空闲块则可以用来进行分配，它会一直保持空闲状态，直到被应用程序显式地分配为止。已分配的块要么被应用程序显式释放，要么被垃圾回收器所释放。</strong></p>
<p><img src="http://wx4.sinaimg.cn/large/63503acbly1fl0lmtd3u8j20nz0a1dhr.jpg"></p>
<p>本文只讲解动态内存分配的一些概念，关于动态内存分配器的实现已经超出了本文的讨论范围。如果有对它感兴趣的同学，可以去参考<a target="_blank" rel="noopener" href="http://gee.cs.oswego.edu/">dlmalloc</a>的源码，它是由Doug Lea（就是写Java并发包的那位）实现的一个设计巧妙的内存分配器，而且源码中的注释十分多。</p>
<h4 id="内存碎片"><a href="#内存碎片" class="headerlink" title="内存碎片"></a>内存碎片</h4><hr>
<p><strong>造成堆的空间利用率很低的主要原因是一种被称为碎片（fragmentation）的现象，当虽然有未使用的内存但这块内存并不能满足分配请求时，就会产生碎片</strong>。有以下两种形式的碎片：</p>
<ul>
<li><p>内部碎片：在一个已分配块比有效载荷大时发生。例如，程序请求一个5字（这里我们不纠结字的大小，假设一个字为4字节，堆的大小为16字并且要保证边界双字对齐）的块，内存分配器为了保证空闲块是双字边界对齐的（具体实现中对齐的规定可能略有不同，但对齐是肯定会有的），只好分配一个6字的块。在本例中，已分配块为6字，有效载荷为5字，内部碎片为已分配块减去有效载荷，为1字。</p>
</li>
<li><p>外部碎片：当空闲内存合计起来足够满足一个分配请求，但是没有一个单独的空闲块足够大到可以来处理这个请求时发生。外部碎片难以量化且不可预测，所以分配器通常采用启发式策略来试图维持少量的大空闲块，而不是维持大量的小空闲块。分配器也会根据策略与分配请求的匹配来分割空闲块与合并空闲块（必须相邻）。</p>
</li>
</ul>
<h4 id="空闲链表"><a href="#空闲链表" class="headerlink" title="空闲链表"></a>空闲链表</h4><hr>
<p><strong>分配器将堆组织为一个连续的已分配块和空闲块的序列，该序列被称为空闲链表</strong>。空闲链表分为隐式空闲链表与显式空闲链表。</p>
<ul>
<li><p>隐式空闲链表，是一个单向链表，并且每个空闲块仅仅是通过头部中的大小字段隐含地连接着的。</p>
</li>
<li><p>显式空闲链表，即是将空闲块组织为某种形式的显式数据结构（为了更加高效地合并与分割空闲块）。例如，将堆组织为一个双向空闲链表，在每个空闲块中，都包含一个前驱节点的指针与后继节点的指针。</p>
</li>
</ul>
<p>查找一个空闲块一般有如下几种策略：</p>
<ul>
<li><p>首次适配：从头开始搜索空闲链表，选择第一个遇见的合适的空闲块。它的优点在于趋向于将大的空闲块保留在链表的后面，缺点是它趋向于在靠近链表前部处留下碎片。</p>
</li>
<li><p>下一次适配：每次从上一次查询结束的地方开始进行搜索，直到遇见合适的空闲块。这种策略通常比首次适配效率高，但是内存利用率则要低得多了。</p>
</li>
<li><p>最佳适配：检查每个空闲块，选择适合所需请求大小的最小空闲块。最佳适配的内存利用率是三种策略中最高的，但它需要对堆进行彻底的搜索。</p>
</li>
</ul>
<p>对一个链表进行查找操作的效率是线性的，为了减少分配请求对空闲块匹配的时间，<strong>分配器通常采用分离存储（segregated storage）的策略，即是维护多个空闲链表，其中每个链表的块有大致相等的大小。</strong></p>
<p>一种简单的分离存储策略：分配器维护一个空闲链表数组，然后将所有可能的块分成一些等价类（也叫做大小类（size class）），每个大小类代表一个空闲链表，并且每个大小类的空闲链表包含大小相等的块，每个块的大小就是这个大小类中最大元素的大小（例如，某个大小类的范围定义为（17~32），那么这个空闲链表全由大小为32的块组成）。</p>
<p>当有一个分配请求时，我们检查相应的空闲链表。如果链表非空，那么就分配其中第一块的全部。如果链表为空，分配器就向操作系统请求一个固定大小的额外内存片，将这个片分成大小相等的块，然后将这些块链接起来形成新的空闲链表。</p>
<p>要释放一个块，分配器只需要简单地将这个块插入到相应的空闲链表的头部。</p>
<h4 id="垃圾回收"><a href="#垃圾回收" class="headerlink" title="垃圾回收"></a>垃圾回收</h4><hr>
<p>在编写C程序时，一般只能显式地分配与释放堆中的内存（<code>malloc()</code>与<code>free()</code>），程序员不仅需要分配内存，还需要负责内存的释放。</p>
<p>许多现代编程语言都内置了自动内存管理机制（通过引入自动内存管理库也可以让C/C++实现自动内存管理），<strong>所谓自动内存管理，就是自动判断不再需要的堆内存（被称为垃圾内存），然后自动释放这些垃圾内存。</strong></p>
<p>自动内存管理的实现是垃圾收集器（garbage collector），它是一种动态内存分配器，它会自动释放应用程序不再需要的已分配块。</p>
<p>垃圾收集器一般采用以下两种（之一）的策略来判断一块堆内存是否为垃圾内存：</p>
<ul>
<li><p>引用计数器：在数据的物理空间中添加一个计数器，当有其他数据与其相关时（引用），该计数器加一，反之则减一。通过定期检查计数器的值，只要为0则认为是垃圾内存，可以释放它所占用的已分配块。使用引用计数器，实现简单直接，但缺点也很明显，它无法回收循环引用的两个对象（假设有对象A与对象B，它们2个互相引用，但实际上对象A与对象B都已经是没用的对象了）。</p>
</li>
<li><p>可达性分析：垃圾收集器将堆内存视为一张有向图，然后选出一组根节点（例如，在Java中一般为类加载器、全局变量、运行时常量池中的引用类型变量等），根节点必须是足够“活跃”的对象。然后计算从根节点集合出发的可达路径，只要从根节点出发不可达的节点，都视为垃圾内存。</p>
</li>
</ul>
<p>垃圾收集器进行回收的算法有如下几种：</p>
<ul>
<li><p>标记-清除：该算法分为标记（mark）和清除（sweep）两个阶段。首先标记出所有需要回收的对象，然后在标记完成后统一回收所有被标记的对象。标记-清除算法实现简单，但它的效率不高，而且会产生许多内存碎片。</p>
</li>
<li><p>标记-整理：标记-整理与标记-清除算法基本一致，只不过后续步骤不是直接对可回收对象进行清理，而是让所有存活的对象都向一端移动，然后直接清理掉边界以外的内存。</p>
</li>
<li><p>复制：<strong>将程序所拥有的内存空间划分为大小相等的两块，每次都只使用其中的一块。当这一块的内存用完了，就把还存活着的对象复制到另一块内存上，然后将已使用过的内存空间进行清理</strong>。这种方法不必考虑内存碎片问题，但内存利用率很低。不过，两块内存等分的划分比例并不是绝对的，像HotSpot虚拟机为了避免浪费，将内存划分为Eden空间与两个Survivor空间，每次都只使用Eden和其中一个Survivor。当回收时，将Eden和Survivor中还存活着的对象一次性地复制到另外一个Survivor空间上，然后清理掉Eden和刚才使用过的Survivor空间。HotSpot虚拟机默认的Eden和Survivor的大小比例为8：1，只有10%的内存空间会被闲置浪费。</p>
</li>
<li><p>分代：<strong>分代算法根据对象的存活周期的不同将内存划分为多块，这样就可以对不同的年代采用不同的回收算法</strong>。一般分为新生代与老年代，新生代存放的是存活率较低的对象，可以采用复制算法；老年代存放的是存活率较高的对象，如果使用复制算法，那么内存空间会不够用，所以必须使用标记-清除或标记-整理算法。</p>
</li>
</ul>
<h3 id="总结"><a href="#总结" class="headerlink" title="总结"></a>总结</h3><hr>
<p>虚拟内存是对内存的一个抽象。支持虚拟内存的CPU需要通过虚拟寻址的方式来引用内存中的数据。CPU加载一个虚拟地址，然后发送给MMU进行地址翻译。地址翻译需要硬件与操作系统之间紧密合作，MMU借助页表来获得物理地址。</p>
<ul>
<li><p>首先，MMU先将虚拟地址发送给TLB以获得PTE（根据VPN寻址）。</p>
</li>
<li><p>如果恰好TLB中缓存了该PTE，那么就返回给MMU，否则MMU需要从高速缓存/内存中获得PTE，然后更新缓存到TLB。</p>
</li>
<li><p>MMU获得了PTE，就可以从PTE中获得对应的PPN，然后结合VPO构造出物理地址。</p>
</li>
<li><p>如果在PTE中发现该虚拟页没有缓存在内存，那么会触发一个缺页异常。缺页异常处理程序会把虚拟页缓存进物理内存，并更新PTE。异常处理程序返回后，CPU会重新加载这个虚拟地址，并进行翻译。</p>
</li>
</ul>
<p>虚拟内存系统简化了内存管理、链接、加载、代码和数据的共享以及访问权限的保护：</p>
<ul>
<li><p>简化链接，独立的地址空间允许每个进程的内存映像使用相同的基本格式，而不管代码和数据实际存放在物理内存的何处。</p>
</li>
<li><p>简化加载，虚拟内存使向内存中加载可执行文件和共享对象文件变得更加容易。</p>
</li>
<li><p>简化共享，独立的地址空间为操作系统提供了一个管理用户进程和内核之间共享的一致机制。</p>
</li>
<li><p>访问权限保护，每个虚拟地址都要经过查询PTE的过程，在PTE中设定访问权限的标记位从而简化内存的权限保护。</p>
</li>
</ul>
<p>操作系统通过将虚拟内存与文件系统结合的方式，来初始化虚拟内存区域，这个过程称为内存映射。应用程序显式分配内存的区域叫做堆，通过动态内存分配器来直接操作堆内存。</p>
<h3 id="参考文献"><a href="#参考文献" class="headerlink" title="参考文献"></a>参考文献</h3><hr>
<ul>
<li><p><a target="_blank" rel="noopener" href="http://www.csapp.cs.cmu.edu/">CS:APP3e, Bryant and O’Hallaron</a></p>
</li>
<li><p><a target="_blank" rel="noopener" href="https://en.wikipedia.org/wiki/Virtual_memory">Virtual memory - Wikipedia</a></p>
</li>
<li><p><a target="_blank" rel="noopener" href="https://en.wikipedia.org/wiki/Garbage_collection_(computer_science)">Garbage collection (computer science) - Wikipedia</a></p>
</li>
</ul>

      
    </div>

    
    
    

    <footer class="post-footer">
        <div class="post-eof"></div>
      
    </footer>
  </article>
</div>




    


<div class="post-block">
  
  

  <article itemscope itemtype="http://schema.org/Article" class="post-content" lang="zh">
    <link itemprop="mainEntityOfPage" href="https://suyuhuan.gitee.io/yuwanzi.io/2017/10/15/2017-10-15-JavaAnnotation/">

    <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
      <meta itemprop="image" content="/yuwanzi.io/images/avatar.gif">
      <meta itemprop="name" content="玉丸子">
      <meta itemprop="description" content="这里是玉丸子的个人博客,与你一起发现更大的世界。">
    </span>

    <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
      <meta itemprop="name" content="玉丸子 | Blog">
    </span>
      <header class="post-header">
        <h2 class="post-title" itemprop="name headline">
          <a href="/yuwanzi.io/2017/10/15/2017-10-15-JavaAnnotation/" class="post-title-link" itemprop="url">注解的那点事儿</a>
        </h2>

        <div class="post-meta-container">
          <div class="post-meta">
    <span class="post-meta-item">
      <span class="post-meta-item-icon">
        <i class="far fa-calendar"></i>
      </span>
      <span class="post-meta-item-text">发表于</span>

      <time title="创建时间：2017-10-15 18:00:00" itemprop="dateCreated datePublished" datetime="2017-10-15T18:00:00+08:00">2017-10-15</time>
    </span>
      <span class="post-meta-item">
        <span class="post-meta-item-icon">
          <i class="far fa-calendar-check"></i>
        </span>
        <span class="post-meta-item-text">更新于</span>
        <time title="修改时间：2020-11-07 08:58:17" itemprop="dateModified" datetime="2020-11-07T08:58:17+08:00">2020-11-07</time>
      </span>
    <span class="post-meta-item">
      <span class="post-meta-item-icon">
        <i class="far fa-folder"></i>
      </span>
      <span class="post-meta-item-text">分类于</span>
        <span itemprop="about" itemscope itemtype="http://schema.org/Thing">
          <a href="/yuwanzi.io/categories/%E5%90%8E%E7%AB%AF/" itemprop="url" rel="index"><span itemprop="name">后端</span></a>
        </span>
          ，
        <span itemprop="about" itemscope itemtype="http://schema.org/Thing">
          <a href="/yuwanzi.io/categories/%E5%90%8E%E7%AB%AF/Java/" itemprop="url" rel="index"><span itemprop="name">Java</span></a>
        </span>
    </span>

  
</div>

        </div>
      </header>

    
    
    
    <div class="post-body" itemprop="articleBody">
          <h3 id="什么是注解"><a href="#什么是注解" class="headerlink" title="什么是注解?"></a>什么是注解?</h3><hr>
<p><strong>注解是<code>JDK1.5</code>引入的一个语法糖，它主要用来当作元数据，简单的说就是用于解释数据的数据</strong>。在Java中，类、方法、变量、参数、包都可以被注解。很多开源框架都使用了注解，例如<code>Spring</code>、<code>MyBatis</code>、<code>JUnit</code>。我们平常最常见的注解可能就是<code>@Override</code>了，该注解用来标识一个重写的函数。</p>
<p>注解的作用：</p>
<ul>
<li><p>配置文件：替代<code>xml</code>等文本文件格式的配置文件。使用注解作为配置文件可以在代码中实现动态配置，相比外部配置文件，注解的方式会减少很多文本量。但缺点也很明显，更改配置需要对代码进行重新编译，无法像外部配置文件一样进行集中管理（所以现在基本都是外部配置文件+注解混合使用）。</p>
</li>
<li><p>数据的标记：注解可以作为一个标记（例如：被<code>@Override</code>标记的方法代表被重写的方法）。</p>
</li>
<li><p>减少重复代码：注解可以减少重复且乏味的代码。比如我们定义一个<code>@ValidateInt</code>，然后通过反射来获得类中所有成员变量，只要是含有<code>@ValidateInt</code>注解的成员变量，我们就可以对其进行数据的规则校验。</p>
</li>
</ul>
<p>定义一个注解非常简单，只需要遵循以下的语法规则：</p>
<figure class="highlight java"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br></pre></td><td class="code"><pre><span class="line"><span class="meta">@Retention(RetentionPolicy.RUNTIME)</span></span><br><span class="line"><span class="meta">@Target(ElementType.FIELD)</span></span><br><span class="line"><span class="meta">@Documented</span></span><br><span class="line"><span class="keyword">public</span> <span class="meta">@interface</span> ValidateInt &#123;</span><br><span class="line">	<span class="comment">// 它们看起来像是定义一个函数，但其实这是注解中的属性</span></span><br><span class="line">    <span class="function"><span class="keyword">int</span> <span class="title">maxLength</span><span class="params">()</span></span>;</span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">int</span> <span class="title">minLength</span><span class="params">()</span></span>;</span><br><span class="line"></span><br><span class="line">&#125;</span><br></pre></td></tr></table></figure>
<p>我们发现上面的代码在定义注解时也使用了注解，这些注解被称为元注解。<strong>作用于注解上的注解称为元注解（元注解其实就是注解的元数据）</strong>，<code>Java</code>中一共有以下元注解。</p>
<ul>
<li><p><code>@Target</code>：用于描述注解的使用范围（注解可以用在什么地方）。</p>
<ul>
<li><p>ElementType.CONSTRUCTOR：构造器。</p>
</li>
<li><p>ElementType.FIELD：成员变量。</p>
</li>
<li><p>ElementType.LOCAL_VARIABLE：局部变量。</p>
</li>
<li><p>ElementType.PACKAGE：包。</p>
</li>
<li><p>ElementType.PARAMETER：参数。</p>
</li>
<li><p>ElementType.METHOD：方法。</p>
</li>
<li><p>ElementType.TYPE：类、接口(包括注解类型) 或enum声明。</p>
</li>
</ul>
</li>
<li><p><code>@Retention</code>：注解的生命周期，用于表示该注解会在什么时期保留。</p>
<ul>
<li><p>RetentionPolicy.RUNTIME：运行时保留，这样就可以通过反射获得了。</p>
</li>
<li><p>RetentionPolicy.CLASS：在class文件中保留。</p>
</li>
<li><p>RetentionPolicy.SOURCE：在源文件中保留。</p>
</li>
</ul>
</li>
<li><p><code>@Documented</code>：表示该注解会被作为被标注的程序成员的公共API，因此可以被例如javadoc此类的工具文档化。</p>
</li>
<li><p><code>@Inherited</code>：表示该注解是可被继承的（如果一个使用了@Inherited修饰的annotation类型被用于一个class，则这个annotation将被用于该class的子类）。</p>
</li>
</ul>
<p>了解了这些基础知识之后，接着完成上述定义的<code>@ValidateInt</code>，我们定义一个<code>Cat</code>类然后在它的成员变量中使用<code>@ValidateInt</code>，并通过反射进行数据校验。</p>
<figure class="highlight java"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br><span class="line">43</span><br><span class="line">44</span><br><span class="line">45</span><br><span class="line">46</span><br><span class="line">47</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">public</span> <span class="class"><span class="keyword">class</span> <span class="title">Cat</span> </span>&#123;</span><br><span class="line"></span><br><span class="line">    <span class="keyword">private</span> String name;</span><br><span class="line"></span><br><span class="line">    <span class="meta">@ValidateInt(minLength = 0, maxLength = 
10)</span></span><br><span class="line">    <span class="keyword">private</span> <span class="keyword">int</span> age;</span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">public</span> String <span class="title">getName</span><span class="params">()</span> </span>&#123;</span><br><span class="line">        <span class="keyword">return</span> name;</span><br><span class="line">    &#125;</span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">public</span> <span class="keyword">void</span> <span class="title">setName</span><span class="params">(String name)</span> </span>&#123;</span><br><span class="line">        <span class="keyword">this</span>.name = name;</span><br><span class="line">    &#125;</span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">public</span> <span class="keyword">int</span> <span class="title">getAge</span><span class="params">()</span> </span>&#123;</span><br><span class="line">        <span class="keyword">return</span> age;</span><br><span class="line">    &#125;</span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">public</span> <span class="keyword">void</span> <span class="title">setAge</span><span class="params">(<span class="keyword">int</span> age)</span> </span>&#123;</span><br><span class="line">        <span class="keyword">this</span>.age = age;</span><br><span class="line">    &#125;</span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">public</span> <span class="keyword">static</span> <span class="keyword">void</span> <span class="title">main</span><span class="params">(String[] args)</span> <span class="keyword">throws</span> IllegalAccessException </span>&#123;</span><br><span class="line">        Cat cat = <span class="keyword">new</span> 
Cat();</span><br><span class="line">        cat.setName(<span class="string">&quot;楼楼&quot;</span>);</span><br><span class="line">        cat.setAge(<span class="number">11</span>);</span><br><span class="line"></span><br><span class="line">        Class&lt;? extends Cat&gt; clazz = cat.getClass();</span><br><span class="line">        Field[] fields = clazz.getDeclaredFields();</span><br><span class="line">        <span class="keyword">if</span> (fields != <span class="keyword">null</span>) &#123;</span><br><span class="line">            <span class="keyword">for</span> (Field field : fields) &#123;</span><br><span class="line">                ValidateInt annotation = field.getDeclaredAnnotation(ValidateInt.class);</span><br><span class="line">                <span class="keyword">if</span> (annotation != <span class="keyword">null</span>) &#123;</span><br><span class="line">                    field.setAccessible(<span class="keyword">true</span>);</span><br><span class="line">                    <span class="keyword">int</span> value = field.getInt(cat);</span><br><span class="line">                    <span class="keyword">if</span> (value &lt; annotation.minLength()) &#123;</span><br><span class="line">                        <span class="comment">// ....</span></span><br><span class="line">                    &#125; <span class="keyword">else</span> <span class="keyword">if</span> (value &gt; annotation.maxLength()) &#123;</span><br><span class="line">                        <span class="comment">// ....</span></span><br><span class="line">                    &#125;</span><br><span class="line">                &#125;</span><br><span class="line">            &#125;</span><br><span class="line">        &#125;</span><br><span class="line">    &#125;</span><br><span class="line"></span><br><span class="line">&#125;</span><br></pre></td></tr></table></figure>
<blockquote>
<p>本文作者为:<a target="_blank" rel="noopener" href="https://github.com/SylvanasSun">SylvanasSun(sylvanas.sun@gmail.com)</a>，首发于<a target="_blank" rel="noopener" href="https://sylvanassun.github.io/">SylvanasSun’s Blog</a>。<br>原文链接：<a target="_blank" rel="noopener" href="https://sylvanassun.github.io/2017/10/15/2017-10-15-JavaAnnotation/">https://sylvanassun.github.io/2017/10/15/2017-10-15-JavaAnnotation/</a><br>（转载请务必保留本段声明，并且保留超链接。）</p>
</blockquote>
<h3 id="注解的实现"><a href="#注解的实现" class="headerlink" title="注解的实现"></a>注解的实现</h3><hr>
<p>注解其实只是<code>Java</code>的一颗语法糖（语法糖是一种方便程序员使用的语法规则，但它其实并没有表面上那么神奇的功能，只不过是由编译器帮程序员生成那些繁琐的代码）。在<code>Java</code>中这样的语法糖还有很多，例如<code>enum</code>、泛型、<code>for-each</code>循环等。</p>
<p>通过阅读<a target="_blank" rel="noopener" href="http://docs.oracle.com/javase/specs/jls/se8/html/jls-9.html#jls-9.6">JLS(Java Language Specification)</a>（当你想了解一个语言特性的实现时，最好的方法就是阅读官方规范）发现，<strong>注解是一个继承自<code>java.lang.annotation.Annotation</code>接口的特殊接口</strong>，原文如下：</p>
<figure class="highlight text"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br></pre></td><td class="code"><pre><span class="line">An annotation type declaration specifies a new annotation type, a special kind of interface type. To distinguish an annotation type declaration from a normal interface declaration, the keyword interface is preceded by an at-sign (@).</span><br><span class="line"></span><br><span class="line">Note that the at-sign (@) and the keyword interface are distinct tokens. It is possible to separate them with whitespace, but this is discouraged as a matter of style.</span><br><span class="line"></span><br><span class="line">The rules for annotation modifiers on an annotation type declaration are specified in §9.7.4 and §9.7.5.</span><br><span class="line"></span><br><span class="line">The Identifier in an annotation type declaration specifies the name of the annotation type.</span><br><span class="line"></span><br><span class="line">It is a compile-time error if an annotation type has the same simple name as any of its enclosing classes or interfaces.</span><br><span class="line"></span><br><span class="line">The direct superinterface of every annotation type is java.lang.annotation.Annotation.</span><br></pre></td></tr></table></figure>
<figure class="highlight java"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">package</span> java.lang.annotation;</span><br><span class="line"></span><br><span class="line"><span class="comment">/**</span></span><br><span class="line"><span class="comment"> * The common interface extended by all annotation types.  Note that an</span></span><br><span class="line"><span class="comment"> * interface that manually extends this one does &lt;i&gt;not&lt;/i&gt; define</span></span><br><span class="line"><span class="comment"> * an annotation type.  
Also note that this interface does not itself</span></span><br><span class="line"><span class="comment"> * define an annotation type.</span></span><br><span class="line"><span class="comment"> *</span></span><br><span class="line"><span class="comment"> * More information about annotation types can be found in section 9.6 of</span></span><br><span class="line"><span class="comment"> * &lt;cite&gt;The Java&amp;trade; Language Specification&lt;/cite&gt;.</span></span><br><span class="line"><span class="comment"> *</span></span><br><span class="line"><span class="comment"> * The &#123;<span class="doctag">@link</span> java.lang.reflect.AnnotatedElement&#125; interface discusses</span></span><br><span class="line"><span class="comment"> * compatibility concerns when evolving an annotation type from being</span></span><br><span class="line"><span class="comment"> * non-repeatable to being repeatable.</span></span><br><span class="line"><span class="comment"> *</span></span><br><span class="line"><span class="comment"> * <span class="doctag">@author</span>  Josh Bloch</span></span><br><span class="line"><span class="comment"> * <span class="doctag">@since</span>   1.5</span></span><br><span class="line"><span class="comment"> */</span></span><br><span class="line"><span class="keyword">public</span> <span class="class"><span class="keyword">interface</span> <span class="title">Annotation</span> </span>&#123;</span><br><span class="line">    ...</span><br><span class="line">&#125;</span><br></pre></td></tr></table></figure>
<p>我们将上节定义的<code>@ValidateInt</code>注解进行反编译来验证这个说法。</p>
<figure class="highlight"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br></pre></td><td class="code"><pre><span class="line">  Last modified Oct <span class="number">14</span>, <span class="number">2017</span>; size <span class="number">479</span> bytes</span><br><span class="line">  MD5 checksum 2d9dd2c169fe854db608c7950af3eca7</span><br><span class="line">  Compiled from <span class="string">&quot;ValidateInt.java&quot;</span></span><br><span class="line"><span class="keyword">public</span> <span class="class"><span class="keyword">interface</span> <span class="title">com</span>.<span class="title">sun</span>.<span class="title">annotation</span>.<span 
class="title">ValidateInt</span> <span class="keyword">extends</span> <span class="title">java</span>.<span class="title">lang</span>.<span class="title">annotation</span>.<span class="title">Annotation</span></span></span><br><span class="line">  minor version: 0</span><br><span class="line">  major version: <span class="number">52</span></span><br><span class="line">  flags: ACC_PUBLIC, ACC_INTERFACE, ACC_ABSTRACT, ACC_ANNOTATION</span><br><span class="line">Constant pool:</span><br><span class="line">   #1 = Class              #18            // com/sun/annotation/ValidateInt</span><br><span class="line">   #2 = Class              #19            // java/lang/Object</span><br><span class="line">   #3 = Class              #20            // java/lang/annotation/Annotation</span><br><span class="line">   #4 = Utf8               maxLength</span><br><span class="line">   #5 = Utf8               ()I</span><br><span class="line">   #6 = Utf8               minLength</span><br><span class="line">   #7 = Utf8               SourceFile</span><br><span class="line">   #8 = Utf8               ValidateInt.java</span><br><span class="line">   #9 = Utf8               RuntimeVisibleAnnotations</span><br><span class="line">  #10 = Utf8               Ljava/lang/annotation/Retention;</span><br><span class="line">  #11 = Utf8               value</span><br><span class="line">  #12 = Utf8               Ljava/lang/annotation/RetentionPolicy;</span><br><span class="line">  #13 = Utf8               RUNTIME</span><br><span class="line">  #14 = Utf8               Ljava/lang/annotation/Target;</span><br><span class="line">  #15 = Utf8               Ljava/lang/annotation/ElementType;</span><br><span class="line">  #16 = Utf8               FIELD</span><br><span class="line">  #17 = Utf8               Ljava/lang/annotation/Documented;</span><br><span class="line">  #18 = Utf8               com/sun/annotation/ValidateInt</span><br><span class="line">  #19 = Utf8               
java/lang/Object</span><br><span class="line">  #20 = Utf8               java/lang/annotation/Annotation</span><br><span class="line">&#123;</span><br><span class="line">  <span class="function"><span class="keyword">public</span> <span class="keyword">abstract</span> <span class="keyword">int</span> <span class="title">maxLength</span><span class="params">()</span></span>;</span><br><span class="line">    descriptor: ()I</span><br><span class="line">    flags: ACC_PUBLIC, ACC_ABSTRACT</span><br><span class="line"></span><br><span class="line">  <span class="function"><span class="keyword">public</span> <span class="keyword">abstract</span> <span class="keyword">int</span> <span class="title">minLength</span><span class="params">()</span></span>;</span><br><span class="line">    descriptor: ()I</span><br><span class="line">    flags: ACC_PUBLIC, ACC_ABSTRACT</span><br><span class="line">&#125;</span><br><span class="line">SourceFile: <span class="string">&quot;ValidateInt.java&quot;</span></span><br><span class="line">RuntimeVisibleAnnotations:</span><br><span class="line">  0: #10(#11=e#12.#13)</span><br><span class="line">  1: #14(#11=[e#15.#16])</span><br><span class="line">  2: #17()</span><br></pre></td></tr></table></figure>
<p><code>public interface com.sun.annotation.ValidateInt extends java.lang.annotation.Annotation</code>，很明显<code>ValidateInt</code>继承自<code>java.lang.annotation.Annotation</code>。</p>
<p>那么，如果注解只是一个接口，又是如何实现对属性的设置呢？这是<strong>因为<code>Java</code>使用了动态代理对我们定义的注解接口生成了一个代理类，而对注解的属性设置其实都是在对这个代理类中的变量进行赋值</strong>。所以我们才能用反射获得注解中的各种属性。</p>
<p>为了证实注解其实是个动态代理对象，接下来我们使用<code>CLHSDB(Command-Line HotSpot Debugger)</code>来查看<code>JVM</code>的运行时数据。如果有童鞋不了解怎么使用的话，可以参考R大的文章<a target="_blank" rel="noopener" href="http://rednaxelafx.iteye.com/blog/1847971">借HSDB来探索HotSpot VM的运行时数据 - Script Ahead, Code Behind - ITeye博客</a>。</p>
<figure class="highlight java"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line"><span class="number">0x000000000257f538</span> com/sun/proxy/$Proxy1</span><br></pre></td></tr></table></figure>
<p>注解的类型为<code>com/sun/proxy/$Proxy1</code>，这正是动态代理生成代理类的默认类型，<code>com/sun/proxy</code>为默认包名，<code>$Proxy</code>是默认的类名，<code>1</code>为自增的编号。</p>
<h3 id="实践-包扫描器"><a href="#实践-包扫描器" class="headerlink" title="实践-包扫描器"></a>实践-包扫描器</h3><hr>
<p>我们在使用<code>Spring</code>的时候，只需要指定一个包名，框架就会去扫描该包下所有带有<code>Spring</code>中的注解的类。实现一个包扫描器很简单，主要思路如下：</p>
<ul>
<li><p>先将传入的包名通过类加载器获得项目内的路径。</p>
</li>
<li><p>然后遍历并获得该路径下的所有class文件路径（需要处理为包名的格式）。</p>
</li>
<li><p>得到了class文件的路径就可以使用反射生成Class对象并获得其中的各种信息了。</p>
</li>
</ul>
<p>定义包扫描器接口：</p>
<figure class="highlight java"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">public</span> <span class="class"><span class="keyword">interface</span> <span class="title">PackageScanner</span> </span>&#123;</span><br><span class="line"></span><br><span class="line">    List&lt;Class&lt;?&gt;&gt; scan(String packageName);</span><br><span class="line"></span><br><span class="line">    List&lt;Class&lt;?&gt;&gt; scan(String packageName, ScannedClassHandler handler);</span><br><span class="line"></span><br><span class="line">&#125;</span><br></pre></td></tr></table></figure>
<p>第二个<code>scan</code>方法需要传入一个<code>ScannedClassHandler</code>接口的实现，该接口是我们定义的回调接口，用于在扫描完类文件之后执行处理操作。</p>
<figure class="highlight java"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br></pre></td><td class="code"><pre><span class="line"><span class="meta">@FunctionalInterface</span> <span class="comment">// 这个注解表示该接口为一个函数接口，用于支持Lambda表达式</span></span><br><span class="line"><span class="keyword">public</span> <span class="class"><span class="keyword">interface</span> <span class="title">ScannedClassHandler</span> </span>&#123;</span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">void</span> <span class="title">execute</span><span class="params">(Class&lt;?&gt; clazz)</span></span>;</span><br><span class="line"></span><br><span class="line">&#125;</span><br></pre></td></tr></table></figure>
<p>我想要包扫描器可以识别和支持不同的文件类型，定义一个枚举类<code>ResourceType</code>：</p>
<figure class="highlight java"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">public</span> <span class="class"><span class="keyword">enum</span> <span class="title">ResourceType</span> </span>&#123;</span><br><span class="line"></span><br><span class="line">    JAR(<span class="string">&quot;jar&quot;</span>),</span><br><span class="line">    FILE(<span class="string">&quot;file&quot;</span>),</span><br><span class="line">    CLASS_FILE(<span class="string">&quot;class&quot;</span>),</span><br><span class="line">    INVALID(<span class="string">&quot;invalid&quot;</span>);</span><br><span class="line"></span><br><span class="line">    <span class="keyword">private</span> String typeName;</span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">public</span> String <span class="title">getTypeName</span><span class="params">()</span> </span>&#123;</span><br><span class="line">        <span class="keyword">return</span> <span class="keyword">this</span>.typeName;</span><br><span class="line">    &#125;</span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">private</span> <span class="title">ResourceType</span><span class="params">(String typeName)</span> </span>&#123;</span><br><span class="line">        <span class="keyword">this</span>.typeName = 
typeName;</span><br><span class="line">    &#125;</span><br><span class="line"></span><br><span class="line">&#125;</span><br></pre></td></tr></table></figure>
<p><code>PathUtils</code>是一个用来处理路径和包转换等操作的工具类：</p>
<figure class="highlight java"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br><span class="line">43</span><br><span class="line">44</span><br><span class="line">45</span><br><span class="line">46</span><br><span class="line">47</span><br><span class="line">48</span><br><span class="line">49</span><br><span class="line">50</span><br><span class="line">51</span><br><span class="line">52</span><br><span class="line">53</span><br><span class="line">54</span><br><span class="line">55</span><br><span class="line">56</span><br><span class="line">57</span><br><span class="line">58</span><br><span class="line">59</span><br><span class="line">60</span><br><span 
class="line">61</span><br><span class="line">62</span><br><span class="line">63</span><br><span class="line">64</span><br><span class="line">65</span><br><span class="line">66</span><br><span class="line">67</span><br><span class="line">68</span><br><span class="line">69</span><br><span class="line">70</span><br><span class="line">71</span><br><span class="line">72</span><br><span class="line">73</span><br><span class="line">74</span><br><span class="line">75</span><br><span class="line">76</span><br><span class="line">77</span><br><span class="line">78</span><br><span class="line">79</span><br><span class="line">80</span><br><span class="line">81</span><br><span class="line">82</span><br><span class="line">83</span><br><span class="line">84</span><br><span class="line">85</span><br><span class="line">86</span><br><span class="line">87</span><br><span class="line">88</span><br><span class="line">89</span><br><span class="line">90</span><br><span class="line">91</span><br><span class="line">92</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">public</span> <span class="class"><span class="keyword">class</span> <span class="title">PathUtils</span> </span>&#123;</span><br><span class="line"></span><br><span class="line">    <span class="keyword">private</span> <span class="keyword">static</span> <span class="keyword">final</span> String FILE_SEPARATOR = System.getProperty(<span class="string">&quot;file.separator&quot;</span>);</span><br><span class="line"></span><br><span class="line">    <span class="keyword">private</span> <span class="keyword">static</span> <span class="keyword">final</span> String CLASS_FILE_SUFFIX = <span class="string">&quot;.class&quot;</span>;</span><br><span class="line"></span><br><span class="line">    <span class="keyword">private</span> <span class="keyword">static</span> <span class="keyword">final</span> String JAR_PROTOCOL = <span class="string">&quot;jar&quot;</span>;</span><br><span 
class="line"></span><br><span class="line">    <span class="keyword">private</span> <span class="keyword">static</span> <span class="keyword">final</span> String FILE_PROTOCOL = <span class="string">&quot;file&quot;</span>;</span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">private</span> <span class="title">PathUtils</span><span class="params">()</span> </span>&#123;</span><br><span class="line">    &#125;</span><br><span class="line">	</span><br><span class="line">	<span class="comment">// 去除后缀名</span></span><br><span class="line">    <span class="function"><span class="keyword">public</span> <span class="keyword">static</span> String <span class="title">trimSuffix</span><span class="params">(String filename)</span> </span>&#123;</span><br><span class="line">        <span class="keyword">if</span> (filename == <span class="keyword">null</span> || <span class="string">&quot;&quot;</span>.equals(filename))</span><br><span class="line">            <span class="keyword">return</span> filename;</span><br><span class="line"></span><br><span class="line">        <span class="keyword">int</span> dotIndex = filename.lastIndexOf(<span class="string">&quot;.&quot;</span>);</span><br><span class="line">        <span class="keyword">if</span> (-<span class="number">1</span> == dotIndex)</span><br><span class="line">            <span class="keyword">return</span> filename;</span><br><span class="line">        <span class="keyword">return</span> filename.substring(<span class="number">0</span>, dotIndex);</span><br><span class="line">    &#125;</span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">public</span> <span class="keyword">static</span> String <span class="title">pathToPackage</span><span class="params">(String path)</span> </span>&#123;</span><br><span class="line">        <span class="keyword">if</span> (path == <span class="keyword">null</span> || <span 
class="string">&quot;&quot;</span>.equals(path))</span><br><span class="line">            <span class="keyword">return</span> path;</span><br><span class="line"></span><br><span class="line">        <span class="keyword">if</span> (path.startsWith(FILE_SEPARATOR))</span><br><span class="line">            path = path.substring(<span class="number">1</span>);</span><br><span class="line">        <span class="keyword">return</span> path.replace(FILE_SEPARATOR, <span class="string">&quot;.&quot;</span>);</span><br><span class="line">    &#125;</span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">public</span> <span class="keyword">static</span> String <span class="title">packageToPath</span><span class="params">(String packageName)</span> </span>&#123;</span><br><span class="line">        <span class="keyword">if</span> (packageName == <span class="keyword">null</span> || <span class="string">&quot;&quot;</span>.equals(packageName))</span><br><span class="line">            <span class="keyword">return</span> packageName;</span><br><span class="line">        <span class="keyword">return</span> packageName.replace(<span class="string">&quot;.&quot;</span>, FILE_SEPARATOR);</span><br><span class="line">    &#125;</span><br><span class="line"></span><br><span class="line">    <span class="comment">/**</span></span><br><span class="line"><span class="comment">     * 根据URL的协议来判断资源类型</span></span><br><span class="line"><span class="comment">     */</span></span><br><span class="line">    <span class="function"><span class="keyword">public</span> <span class="keyword">static</span> ResourceType <span class="title">getResourceType</span><span class="params">(URL url)</span> </span>&#123;</span><br><span class="line">        String protocol = url.getProtocol();</span><br><span class="line">        <span class="keyword">switch</span> (protocol) &#123;</span><br><span class="line">            <span class="keyword">case</span> 
JAR_PROTOCOL:</span><br><span class="line">                <span class="keyword">return</span> ResourceType.JAR;</span><br><span class="line">            <span class="keyword">case</span> FILE_PROTOCOL:</span><br><span class="line">                <span class="keyword">return</span> ResourceType.FILE;</span><br><span class="line">            <span class="keyword">default</span>:</span><br><span class="line">                <span class="keyword">return</span> ResourceType.INVALID;</span><br><span class="line">        &#125;</span><br><span class="line">    &#125;</span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">public</span> <span class="keyword">static</span> <span class="keyword">boolean</span> <span class="title">isClassFile</span><span class="params">(String path)</span> </span>&#123;</span><br><span class="line">        <span class="keyword">if</span> (path == <span class="keyword">null</span> || <span class="string">&quot;&quot;</span>.equals(path))</span><br><span class="line">            <span class="keyword">return</span> <span class="keyword">false</span>;</span><br><span class="line">        <span class="keyword">return</span> path.endsWith(CLASS_FILE_SUFFIX);</span><br><span class="line">    &#125;</span><br><span class="line"></span><br><span class="line">    <span class="comment">/**</span></span><br><span class="line"><span class="comment">     * 抽取URL中的主要路径.</span></span><br><span class="line"><span class="comment">     * Example:</span></span><br><span class="line"><span class="comment">     * &quot;file:/com/example/hello&quot; to &quot;/com/example/hello&quot;</span></span><br><span class="line"><span class="comment">     * &quot;jar:file:/com/example/hello.jar!/&quot; to &quot;/com/example/hello.jar&quot;</span></span><br><span class="line"><span class="comment">     */</span></span><br><span class="line">    <span class="function"><span class="keyword">public</span> <span 
class="keyword">static</span> String <span class="title">getUrlMainPath</span><span class="params">(URL url)</span> <span class="keyword">throws</span> UnsupportedEncodingException </span>&#123;</span><br><span class="line">        <span class="keyword">if</span> (url == <span class="keyword">null</span>)</span><br><span class="line">            <span class="keyword">return</span> <span class="string">&quot;&quot;</span>;</span><br><span class="line">		</span><br><span class="line">		<span class="comment">// 如果不使用URLDecoder解码的话，路径会出现中文乱码问题</span></span><br><span class="line">        String filePath = URLDecoder.decode(url.getFile(), <span class="string">&quot;utf-8&quot;</span>);</span><br><span class="line">        <span class="comment">// if file is not the jar</span></span><br><span class="line">        <span class="keyword">int</span> pos = filePath.indexOf(<span class="string">&quot;!&quot;</span>);</span><br><span class="line">        <span class="keyword">if</span> (-<span class="number">1</span> == pos)</span><br><span class="line">            <span class="keyword">return</span> filePath;</span><br><span class="line"></span><br><span class="line">        <span class="keyword">return</span> filePath.substring(<span class="number">5</span>, pos);</span><br><span class="line">    &#125;</span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">public</span> <span class="keyword">static</span> String <span class="title">concat</span><span class="params">(Object... 
args)</span> </span>&#123;</span><br><span class="line">        <span class="keyword">if</span> (args == <span class="keyword">null</span> || args.length == <span class="number">0</span>)</span><br><span class="line">            <span class="keyword">return</span> <span class="string">&quot;&quot;</span>;</span><br><span class="line"></span><br><span class="line">        StringBuilder stringBuilder = <span class="keyword">new</span> StringBuilder();</span><br><span class="line">        <span class="keyword">for</span> (<span class="keyword">int</span> i = <span class="number">0</span>; i &lt; args.length; i++)</span><br><span class="line">            stringBuilder.append(args[i]);</span><br><span class="line"></span><br><span class="line">        <span class="keyword">return</span> stringBuilder.toString();</span><br><span class="line">    &#125;</span><br><span class="line"></span><br><span class="line">&#125;</span><br></pre></td></tr></table></figure>
<p>定义了这些辅助类之后，就可以去实现包扫描器了。</p>
<figure class="highlight java"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">public</span> <span class="class"><span class="keyword">class</span> <span class="title">SimplePackageScanner</span> <span class="keyword">implements</span> <span class="title">PackageScanner</span> </span>&#123;</span><br><span class="line"></span><br><span class="line">    <span class="keyword">protected</span> String packageName;</span><br><span class="line"></span><br><span class="line">    <span class="keyword">protected</span> String packagePath;</span><br><span class="line"></span><br><span class="line">    <span class="keyword">protected</span> ClassLoader classLoader;</span><br><span class="line"></span><br><span class="line">    <span class="keyword">private</span> 
Logger logger;</span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">public</span> <span class="title">SimplePackageScanner</span><span class="params">()</span> </span>&#123;</span><br><span class="line">        <span class="keyword">this</span>.classLoader = Thread.currentThread().getContextClassLoader();</span><br><span class="line">        <span class="keyword">this</span>.logger = LoggerFactory.getLogger(SimplePackageScanner.class);</span><br><span class="line">    &#125;</span><br><span class="line"></span><br><span class="line">    <span class="meta">@Override</span></span><br><span class="line">    <span class="keyword">public</span> List&lt;Class&lt;?&gt;&gt; scan(String packageName) &#123;</span><br><span class="line">        <span class="keyword">return</span> <span class="keyword">this</span>.scan(packageName, <span class="keyword">null</span>);</span><br><span class="line">    &#125;</span><br><span class="line"></span><br><span class="line">    <span class="meta">@Override</span></span><br><span class="line">    <span class="keyword">public</span> List&lt;Class&lt;?&gt;&gt; scan(String packageName, ScannedClassHandler handler) &#123;</span><br><span class="line">        <span class="keyword">this</span>.initPackageNameAndPath(packageName);</span><br><span class="line">        <span class="keyword">if</span> (logger.isDebugEnabled())</span><br><span class="line">            logger.debug(<span class="string">&quot;Start scanning package: &#123;&#125; ....&quot;</span>, <span class="keyword">this</span>.packageName);</span><br><span class="line">        URL url = <span class="keyword">this</span>.getResource(<span class="keyword">this</span>.packagePath);</span><br><span class="line">        <span class="keyword">if</span> (url == <span class="keyword">null</span>)</span><br><span class="line">            <span class="keyword">return</span> <span class="keyword">new</span> 
ArrayList&lt;&gt;();</span><br><span class="line">        <span class="keyword">return</span> <span class="keyword">this</span>.parseUrlThenScan(url, handler);</span><br><span class="line">    &#125;</span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">private</span> <span class="keyword">void</span> <span class="title">initPackageNameAndPath</span><span class="params">(String packageName)</span> </span>&#123;</span><br><span class="line">        <span class="keyword">this</span>.packageName = packageName;</span><br><span class="line">        <span class="keyword">this</span>.packagePath = PathUtils.packageToPath(packageName);</span><br><span class="line">    &#125;</span><br><span class="line">	</span><br><span class="line">&#125;	</span><br></pre></td></tr></table></figure>
<p>函数<code>getResource()</code>会根据包名来通过类加载器获得当前项目下的URL对象，如果这个URL为空则直接返回一个空的<code>ArrayList</code>。</p>
<figure class="highlight java"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br></pre></td><td class="code"><pre><span class="line"><span class="function"><span class="keyword">protected</span> URL <span class="title">getResource</span><span class="params">(String packagePath)</span> </span>&#123;</span><br><span class="line">    URL url = <span class="keyword">this</span>.classLoader.getResource(packagePath);</span><br><span class="line">    <span class="keyword">if</span> (url != <span class="keyword">null</span>)</span><br><span class="line">        logger.debug(<span class="string">&quot;Get resource: &#123;&#125; success!&quot;</span>, packagePath);</span><br><span class="line">    <span class="keyword">else</span></span><br><span class="line">        logger.debug(<span class="string">&quot;Get resource: &#123;&#125; failed,end of scan.&quot;</span>, packagePath);</span><br><span class="line">    <span class="keyword">return</span> url;</span><br><span class="line">&#125;</span><br></pre></td></tr></table></figure>
<p>函数<code>parseUrlThenScan()</code>会解析URL对象并进行扫描，最终返回一个类列表。</p>
<figure class="highlight java"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br></pre></td><td class="code"><pre><span class="line">  <span class="keyword">protected</span> List&lt;Class&lt;?&gt;&gt; parseUrlThenScan(URL url, ScannedClassHandler handler) &#123;</span><br><span class="line">      String urlPath = <span class="string">&quot;&quot;</span>;</span><br><span class="line">      <span class="keyword">try</span> &#123;</span><br><span class="line">    <span class="comment">// 先提取出URL中的路径（不含协议名等信息）</span></span><br><span class="line">          urlPath = PathUtils.getUrlMainPath(url);</span><br><span class="line">      &#125; <span class="keyword">catch</span> (UnsupportedEncodingException e) &#123;</span><br><span class="line">          e.printStackTrace();</span><br><span class="line">          logger.debug(<span class="string">&quot;Get url path 
failed.&quot;</span>);</span><br><span class="line">      &#125;</span><br><span class="line"></span><br><span class="line">      <span class="comment">// 判断URL的类型</span></span><br><span class="line">      ResourceType type = PathUtils.getResourceType(url);</span><br><span class="line">      List&lt;Class&lt;?&gt;&gt; classList = <span class="keyword">new</span> ArrayList&lt;&gt;();</span><br><span class="line"></span><br><span class="line">      <span class="keyword">try</span> &#123;</span><br><span class="line">          <span class="keyword">switch</span> (type) &#123;</span><br><span class="line">              <span class="keyword">case</span> FILE:</span><br><span class="line">                  classList = <span class="keyword">this</span>.getClassListFromFile(urlPath, <span class="keyword">this</span>.packageName);</span><br><span class="line">                  <span class="keyword">break</span>;</span><br><span class="line">              <span class="keyword">case</span> JAR:</span><br><span class="line">                  classList = <span class="keyword">this</span>.getClassListFromJar(urlPath);</span><br><span class="line">                  <span class="keyword">break</span>;</span><br><span class="line">              <span class="keyword">default</span>:</span><br><span class="line">                  logger.debug(<span class="string">&quot;Unsupported file type.&quot;</span>);</span><br><span class="line">          &#125;</span><br><span class="line">      &#125; <span class="keyword">catch</span> (IOException | ClassNotFoundException e) &#123;</span><br><span class="line">          e.printStackTrace();</span><br><span class="line">          logger.debug(<span class="string">&quot;Get class list failed.&quot;</span>);</span><br><span class="line">      &#125;</span><br><span class="line"></span><br><span class="line"><span class="comment">// 执行回调函数</span></span><br><span class="line">      <span class="keyword">this</span>.invokeCallback(classList, 
handler);</span><br><span class="line">      logger.debug(<span class="string">&quot;End of scan &lt;&#123;&#125;&gt;.&quot;</span>, urlPath);</span><br><span class="line">      <span class="keyword">return</span> classList;</span><br><span class="line">  &#125;</span><br></pre></td></tr></table></figure>
<p>函数<code>getClassListFromFile()</code>会扫描路径下的所有class文件，并拼接包名生成Class对象。</p>
<figure class="highlight java"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">protected</span> List&lt;Class&lt;?&gt;&gt; getClassListFromFile(String path, String packageName) <span class="keyword">throws</span> ClassNotFoundException &#123;</span><br><span class="line">    File file = <span class="keyword">new</span> File(path);</span><br><span class="line">    List&lt;Class&lt;?&gt;&gt; classList = <span class="keyword">new</span> ArrayList&lt;&gt;();</span><br><span class="line"></span><br><span class="line">    File[] listFiles = file.listFiles();</span><br><span class="line">    <span class="keyword">if</span> (listFiles != <span class="keyword">null</span>) &#123;</span><br><span class="line">        <span class="keyword">for</span> (File f : listFiles) &#123;</span><br><span class="line">            <span class="keyword">if</span> (f.isDirectory()) &#123;</span><br><span class="line">	<span class="comment">// 如果是一个文件夹，则继续递归调用，注意传递的包名</span></span><br><span class="line">                List&lt;Class&lt;?&gt;&gt; list = getClassListFromFile(f.getAbsolutePath(),</span><br><span class="line">   
                     PathUtils.concat(packageName, <span class="string">&quot;.&quot;</span>, f.getName()));</span><br><span class="line">                classList.addAll(list);</span><br><span class="line">            &#125; <span class="keyword">else</span> <span class="keyword">if</span> (PathUtils.isClassFile(f.getName())) &#123;</span><br><span class="line">                <span class="comment">// 我们不添加名字带有$的class文件，这些都是JVM动态生成的</span></span><br><span class="line">                String className = PathUtils.trimSuffix(f.getName());</span><br><span class="line">                <span class="keyword">if</span> (-<span class="number">1</span> != className.lastIndexOf(<span class="string">&quot;$&quot;</span>))</span><br><span class="line">                    <span class="keyword">continue</span>;</span><br><span class="line"></span><br><span class="line">                String finalClassName = PathUtils.concat(packageName, <span class="string">&quot;.&quot;</span>, className);</span><br><span class="line">                classList.add(Class.forName(finalClassName));</span><br><span class="line">            &#125;</span><br><span class="line">        &#125;</span><br><span class="line">    &#125;</span><br><span class="line"></span><br><span class="line">    <span class="keyword">return</span> classList;</span><br><span class="line">&#125;</span><br></pre></td></tr></table></figure>
<p>函数<code>getClassListFromJar()</code>会扫描Jar中的class文件。</p>
<figure class="highlight java"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">protected</span> List&lt;Class&lt;?&gt;&gt; getClassListFromJar(String jarPath) <span class="keyword">throws</span> IOException, ClassNotFoundException &#123;</span><br><span class="line">    <span class="keyword">if</span> (logger.isDebugEnabled())</span><br><span class="line">        logger.debug(<span class="string">&quot;Start scanning jar: &#123;&#125;&quot;</span>, jarPath);</span><br><span class="line"></span><br><span class="line">    JarInputStream jarInputStream = <span class="keyword">new</span> JarInputStream(<span class="keyword">new</span> FileInputStream(jarPath));</span><br><span class="line">    JarEntry jarEntry = jarInputStream.getNextJarEntry();</span><br><span class="line">    List&lt;Class&lt;?&gt;&gt; classList = <span class="keyword">new</span> ArrayList&lt;&gt;();</span><br><span class="line"></span><br><span class="line">    <span class="keyword">while</span> (jarEntry != <span class="keyword">null</span>) &#123;</span><br><span class="line">        String name = jarEntry.getName();</span><br><span class="line">        <span class="keyword">if</span> (name.startsWith(<span class="keyword">this</span>.packageName) &amp;&amp; PathUtils.isClassFile(name))</span><br><span class="line">            classList.add(Class.forName(name));</span><br><span class="line">        jarEntry = 
jarInputStream.getNextJarEntry();</span><br><span class="line">    &#125;</span><br><span class="line"></span><br><span class="line">    <span class="keyword">return</span> classList;</span><br><span class="line">&#125;</span><br></pre></td></tr></table></figure>
<p>函数<code>invokeCallback()</code>遍历类对象列表，然后执行回调函数。</p>
<figure class="highlight java"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br></pre></td><td class="code"><pre><span class="line"><span class="function"><span class="keyword">protected</span> <span class="keyword">void</span> <span class="title">invokeCallback</span><span class="params">(List&lt;Class&lt;?&gt;&gt; classList, ScannedClassHandler handler)</span> </span>&#123;</span><br><span class="line">    <span class="keyword">if</span> (classList != <span class="keyword">null</span> &amp;&amp; handler != <span class="keyword">null</span>) &#123;</span><br><span class="line">        <span class="keyword">for</span> (Class&lt;?&gt; clazz : classList) &#123;</span><br><span class="line">            handler.execute(clazz);</span><br><span class="line">        &#125;</span><br><span class="line">    &#125;</span><br><span class="line">&#125;</span><br></pre></td></tr></table></figure>
<p>本节中实现的包扫描器源码地址：<a target="_blank" rel="noopener" href="https://gist.github.com/SylvanasSun/6ab31dcfd9670f29a46917decdba36d1">https://gist.github.com/SylvanasSun/6ab31dcfd9670f29a46917decdba36d1</a></p>

      
    </div>

    
    
    

    <footer class="post-footer">
        <div class="post-eof"></div>
      
    </footer>
  </article>
</div>




    


<div class="post-block">
  
  

  <article itemscope itemtype="http://schema.org/Article" class="post-content" lang="zh">
    <link itemprop="mainEntityOfPage" href="https://suyuhuan.gitee.io/yuwanzi.io/2017/10/08/2017-10-08-BrowserRenderOptimization/">

    <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
      <meta itemprop="image" content="/yuwanzi.io/images/avatar.gif">
      <meta itemprop="name" content="玉丸子">
      <meta itemprop="description" content="这里是玉丸子的个人博客,与你一起发现更大的世界。">
    </span>

    <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
      <meta itemprop="name" content="玉丸子 | Blog">
    </span>
      <header class="post-header">
        <h2 class="post-title" itemprop="name headline">
          <a href="/yuwanzi.io/2017/10/08/2017-10-08-BrowserRenderOptimization/" class="post-title-link" itemprop="url">浏览器性能优化-渲染性能</a>
        </h2>

        <div class="post-meta-container">
          <div class="post-meta">
    <span class="post-meta-item">
      <span class="post-meta-item-icon">
        <i class="far fa-calendar"></i>
      </span>
      <span class="post-meta-item-text">发表于</span>

      <time title="创建时间：2017-10-08 12:00:00" itemprop="dateCreated datePublished" datetime="2017-10-08T12:00:00+08:00">2017-10-08</time>
    </span>
      <span class="post-meta-item">
        <span class="post-meta-item-icon">
          <i class="far fa-calendar-check"></i>
        </span>
        <span class="post-meta-item-text">更新于</span>
        <time title="修改时间：2020-11-07 08:58:17" itemprop="dateModified" datetime="2020-11-07T08:58:17+08:00">2020-11-07</time>
      </span>
    <span class="post-meta-item">
      <span class="post-meta-item-icon">
        <i class="far fa-folder"></i>
      </span>
      <span class="post-meta-item-text">分类于</span>
        <span itemprop="about" itemscope itemtype="http://schema.org/Thing">
          <a href="/yuwanzi.io/categories/%E5%89%8D%E7%AB%AF/" itemprop="url" rel="index"><span itemprop="name">前端</span></a>
        </span>
          . 
        <span itemprop="about" itemscope itemtype="http://schema.org/Thing">
          <a href="/yuwanzi.io/categories/%E5%89%8D%E7%AB%AF/%E6%B5%8F%E8%A7%88%E5%99%A8/" itemprop="url" rel="index"><span itemprop="name">浏览器</span></a>
        </span>
    </span>

  
</div>

        </div>
      </header>

    
    
    
    <div class="post-body" itemprop="articleBody">
          <p>在<a target="_blank" rel="noopener" href="https://sylvanassun.github.io/2017/10/03/2017-10-03-BrowserCriticalRenderingPath/">浏览器渲染过程与性能优化</a>一文中（建议先去看一下这篇文章再来阅读本文），我们了解与认识了浏览器的关键渲染路径以及如何优化页面的加载速度。在本文中，我们主要关注的是如何提高浏览器的渲染性能（浏览器进行布局计算、绘制像素等操作）与效率。</p>
<p>很多网页都使用了看起来效果非常酷炫的动画与用户进行交互，这些动画效果显著提高了用户的体验，但如果因为性能原因导致动画的每秒帧数太低，反而会让用户体验变得更差（如果一个酷炫的动画效果运行起来总是经常卡顿或者看起来反应很慢，这些都会让用户感觉糟透了）。</p>
<p>一个流畅的动画需要保持在每秒60帧，也就是说每一帧的时间预算约为16毫秒（每秒有1000毫秒，1000/60 约等于 16毫秒一帧）。但浏览器在每一帧中还有其他工作需要占用时间，所以留给渲染任务的时间估算为10毫秒左右。如果能够理解浏览器的渲染过程并发现性能瓶颈对其优化，可以使你的项目变得具有交互性且动画效果如飘柔般顺滑。</p>
<blockquote>
<p>本文作者为: <a target="_blank" rel="noopener" href="https://github.com/SylvanasSun">SylvanasSun(sylvanas.sun@gmail.com)</a>.转载请务必将本段话置于文章开头处(保留超链接).<br>本文首发自<a target="_blank" rel="noopener" href="https://sylvanassun.github.io/">SylvanasSun Blog</a>,原文链接: <a target="_blank" rel="noopener" href="https://sylvanassun.github.io/2017/10/08/2017-10-08-BrowserRenderOptimization/">https://sylvanassun.github.io/2017/10/08/2017-10-08-BrowserRenderOptimization/</a></p>
</blockquote>
<h3 id="像素管道"><a href="#像素管道" class="headerlink" title="像素管道"></a>像素管道</h3><hr>
<p><strong>所谓像素管道其实就是浏览器将渲染树绘制成像素的流程。管道的每个区域都有可能产生卡顿，即管道中的某一区域如果发生变化，浏览器将会进行自动重排，然后重新绘制受影响的区域。</strong></p>
<p><img src="http://wx4.sinaimg.cn/large/63503acbly1fk9uu5zk8xj20o603zdh7.jpg" alt="像素管道"></p>
<ul>
<li><p>JavaScript：<strong>该区域其实指的是实现动画效果的方法</strong>，一般使用<code>JavaScript</code>来实现动画，例如<code>jQuery</code>的<code>animate</code>函数、对一个数据集进行排序或动态添加一些<code>DOM</code>节点等。当然，也可以使用其他的方法来实现动画效果，像<code>CSS</code>的<code>Animation</code>、<code>Transition</code>和<code>Transform</code>。</p>
</li>
<li><p>Style：<strong>该区域为样式计算阶段，浏览器会根据选择器（就是<code>CSS</code>选择器，如<code>.td</code>）计算出哪些节点应用哪些<code>CSS</code>规则，然后计算出每个节点的最终样式并应用到节点上。</strong></p>
</li>
<li><p>Layout：<strong>该区域为布局计算阶段，浏览器会在该过程中根据节点的样式规则来计算它要占据的空间大小以及在屏幕中的位置</strong>。</p>
</li>
<li><p>Paint：<strong>该区域为绘制阶段，浏览器会先创建绘图调用的列表，然后填充像素</strong>。绘制阶段会涉及到文本、颜色、图像、边框和阴影，基本上包括了每个可视部分。绘制一般是在多个图层（用过<code>Photoshop</code>等图片编辑软件的童鞋一定很眼熟图层这个词，这里的图层的含义其实是差不多的）上完成的。</p>
</li>
<li><p>Composite：<strong>该区域为合成阶段，浏览器将多个图层按照正确顺序绘制到屏幕上。</strong></p>
</li>
</ul>
<p>假设我们修改了一个几何属性（例如宽度、高度等影响布局的属性），这时Layout阶段受到了影响，浏览器必须检查所有其他区域的元素，然后自动重排页面，任何受到影响的部分都需要重新绘制，并且最终绘制的元素还需要重新进行合成（简单地说就是整个像素管道都要重新执行一遍）。</p>
<p>如果我们只修改了不会影响页面布局的属性，例如背景图片、文字颜色等，那么浏览器会跳过布局阶段，但仍需要重新绘制。</p>
<p><img src="http://wx2.sinaimg.cn/large/63503acbly1fk9uu6h7edj20nr03wq46.jpg" alt="跳过布局阶段的像素管道"></p>
<p>又或者，我们只修改了一个不影响布局也不影响绘制的属性，那么浏览器将跳过布局与绘制阶段，显然这种改动是性能开销最小的。</p>
<p><img src="http://wx4.sinaimg.cn/large/63503acbly1fk9uu6u2u3j20o5040wfm.jpg" alt="跳过布局与绘制阶段的像素管道"></p>
<p>如果想要知道每个<code>CSS</code>属性将会对哪个阶段产生怎样的影响，请去<a target="_blank" rel="noopener" href="https://csstriggers.com/">CSS Triggers</a>，该网站详细地说明了每个<code>CSS</code>属性会影响到哪个阶段。</p>
<h3 id="使用RequestAnimationFrame函数实现动画"><a href="#使用RequestAnimationFrame函数实现动画" class="headerlink" title="使用RequestAnimationFrame函数实现动画"></a>使用RequestAnimationFrame函数实现动画</h3><hr>
<p>我们经常使用<code>JavaScript</code>来实现动画效果，然而时机不当或长时间运行的<code>JavaScript</code>可能就是导致你性能下降的原因。</p>
<p>避免使用<code>setTimeout()</code>或者<code>setInterval()</code>函数来实现动画效果，这种做法的主要问题是<strong>回调将会在帧中的某个时间点运行，这可能会刚好在末尾（会丢失帧导致发生卡顿）。</strong></p>
<p><img src="http://wx4.sinaimg.cn/large/63503acbly1fka0wxn8m2j20p40by40d.jpg" alt="setTimeout 回调在帧的末尾运行导致丢帧"></p>
<p>有些第三方库仍在使用<code>setTimeout()</code>与<code>setInterval()</code>函数来实现动画效果，这会产生很多不必要的性能下降，例如老版本的<code>jQuery</code>，如果你使用的是<code>jQuery 3</code>，那么不必为此担心，<code>jQuery 3</code>已经全面改写了动画模块，采用了<code>requestAnimationFrame()</code>函数来实现动画效果。但如果你使用的是之前版本的<code>jQuery</code>，那么就需要<a target="_blank" rel="noopener" href="https://github.com/gnarf/jquery-requestAnimationFrame">jquery-requestAnimationFrame</a>来将<code>setTimeout()</code>替换为<code>requestAnimationFrame()</code>函数。</p>
<p>读到这里，想必一定会对<code>requestAnimationFrame()</code>产生好奇。要想得到一个流畅的动画，我们希望让视觉变化发生在每一帧的开头，而保证<code>JavaScript</code>在帧开始时运行的方式则是使用<code>requestAnimationFrame()</code>函数。在用法上它与<code>setTimeout()</code>很相似，都是通过在回调函数中再次调度自身来不断更新画面以达到动画的效果，区别在于回调的执行时机由浏览器安排在下一次重绘之前。<code>requestAnimationFrame()</code>的使用方法如下：</p>
<figure class="highlight javascript"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br></pre></td><td class="code"><pre><span class="line"><span class="function"><span class="keyword">function</span> <span class="title">updateScreen</span>(<span class="params">time</span>) </span>&#123;</span><br><span class="line">	<span class="comment">// 这是你的动画效果函数</span></span><br><span class="line">&#125;</span><br><span class="line"></span><br><span class="line"><span class="comment">// 将你的动画效果函数放入requestAnimationFrame()作为回调函数</span></span><br><span class="line">requestAnimationFrame(updateScreen);</span><br></pre></td></tr></table></figure>
<p>并不是所有浏览器都支持<code>requestAnimationFrame()</code>函数，如<code>IE9</code>（又是万恶的<code>IE</code>），但基本上现代浏览器都会支持这个功能的，如果你需要兼容老旧版本的浏览器，可以使用以下函数。</p>
<figure class="highlight javascript"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br></pre></td><td class="code"><pre><span class="line"><span class="comment">// 本段代码截取自Paul Irish : https://gist.github.com/paulirish/1579671</span></span><br><span class="line">(<span class="function"><span class="keyword">function</span>(<span class="params"></span>) </span>&#123;</span><br><span class="line">    <span class="keyword">var</span> lastTime = <span class="number">0</span>;</span><br><span class="line">    <span class="keyword">var</span> vendors = [<span class="string">&#x27;ms&#x27;</span>, <span class="string">&#x27;moz&#x27;</span>, <span class="string">&#x27;webkit&#x27;</span>, <span class="string">&#x27;o&#x27;</span>];</span><br><span class="line">    <span class="keyword">for</span>(<span class="keyword">var</span> x = <span class="number">0</span>; x &lt; vendors.length &amp;&amp; !<span class="built_in">window</span>.requestAnimationFrame; ++x) &#123;</span><br><span class="line">        <span class="built_in">window</span>.requestAnimationFrame = <span class="built_in">window</span>[vendors[x]+<span 
class="string">&#x27;RequestAnimationFrame&#x27;</span>];</span><br><span class="line">        <span class="built_in">window</span>.cancelAnimationFrame = <span class="built_in">window</span>[vendors[x]+<span class="string">&#x27;CancelAnimationFrame&#x27;</span>] </span><br><span class="line">                                   || <span class="built_in">window</span>[vendors[x]+<span class="string">&#x27;CancelRequestAnimationFrame&#x27;</span>];</span><br><span class="line">    &#125;</span><br><span class="line"> 	</span><br><span class="line">	<span class="comment">// 如果浏览器不支持，则使用setTimeout()</span></span><br><span class="line">    <span class="keyword">if</span> (!<span class="built_in">window</span>.requestAnimationFrame)</span><br><span class="line">        <span class="built_in">window</span>.requestAnimationFrame = <span class="function"><span class="keyword">function</span>(<span class="params">callback, element</span>) </span>&#123;</span><br><span class="line">            <span class="keyword">var</span> currTime = <span class="keyword">new</span> <span class="built_in">Date</span>().getTime();</span><br><span class="line">            <span class="keyword">var</span> timeToCall = <span class="built_in">Math</span>.max(<span class="number">0</span>, <span class="number">16</span> - (currTime - lastTime));</span><br><span class="line">            <span class="keyword">var</span> id = <span class="built_in">window</span>.setTimeout(<span class="function"><span class="keyword">function</span>(<span class="params"></span>) </span>&#123; callback(currTime + timeToCall); &#125;, </span><br><span class="line">              timeToCall);</span><br><span class="line">            lastTime = currTime + timeToCall;</span><br><span class="line">            <span class="keyword">return</span> id;</span><br><span class="line">        &#125;;</span><br><span class="line"> </span><br><span class="line">    <span class="keyword">if</span> (!<span 
class="built_in">window</span>.cancelAnimationFrame)</span><br><span class="line">        <span class="built_in">window</span>.cancelAnimationFrame = <span class="function"><span class="keyword">function</span>(<span class="params">id</span>) </span>&#123;</span><br><span class="line">            <span class="built_in">clearTimeout</span>(id);</span><br><span class="line">        &#125;;</span><br><span class="line">&#125;());</span><br></pre></td></tr></table></figure>
<h3 id="Web-Workers"><a href="#Web-Workers" class="headerlink" title="Web Workers"></a>Web Workers</h3><hr>
<p><strong>我们知道<code>JavaScript</code>是单线程的，但浏览器可不是单线程的</strong>。<strong><code>JavaScript</code>在浏览器的主线程上运行</strong>，这恰好与样式计算、布局等许多其他情况下的渲染操作一起运行，<strong>如果<code>JavaScript</code>的运行时间过长，就会阻塞这些后续工作，导致帧丢失。</strong></p>
<p>使用<code>Chrome</code>开发者工具的<code>Timeline</code>功能可以帮助我们查看每个<code>JavaScript</code>脚本的运行时间（包括子脚本），帮助我们发现并突破性能瓶颈。</p>
<p><img src="http://wx3.sinaimg.cn/large/63503acbly1fka16g0tbyj20tq0lqacb.jpg" alt="数据采自掘金"></p>
<p>在找到影响性能的<code>JavaScript</code>脚本后，我们可以通过<code>Web Workers</code>进行优化。<code>Web Workers</code>是<code>HTML5</code>提出的一个标准，它<strong>可以让<code>JavaScript</code>脚本运行在后台线程（类似于创建一个子线程），而后台线程不会影响到主线程中的页面</strong>。不过，<strong>使用<code>Web Workers</code>创建的线程是不能操作<code>DOM</code>树的</strong>（这也是<code>Web Workers</code>没有颠覆<code>JavaScript</code>是单线程的原因，<code>JavaScript</code>之所以一直是单线程设计主要也是因为为了避免多个脚本操作<code>DOM</code>树的同步问题，这会提高很多复杂性），所以它只适合于做一些纯计算的工作（数据的排序、遍历等）。</p>
<p>如果你的<code>JavaScript</code>必须要在主线程中执行，那么只能选择另一种方法。将一个大任务分割为多个小任务（每个占用时间不超过几毫秒），并且在每帧的<code>requestAnimationFrame()</code>函数中运行：</p>
<figure class="highlight javascript"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">var</span> taskList = breakBigTaskIntoMicroTasks(monsterTaskList);</span><br><span class="line">requestAnimationFrame(processTaskList);</span><br><span class="line"></span><br><span class="line"><span class="function"><span class="keyword">function</span> <span class="title">processTaskList</span>(<span class="params">taskStartTime</span>) </span>&#123;</span><br><span class="line">  <span class="keyword">var</span> taskFinishTime;</span><br><span class="line"></span><br><span class="line">  <span class="keyword">do</span> &#123;</span><br><span class="line">    <span class="comment">// 从列表中弹出任务</span></span><br><span class="line">    <span class="keyword">var</span> nextTask = taskList.pop();</span><br><span class="line"></span><br><span class="line">    <span class="comment">// 执行任务</span></span><br><span class="line">    processTask(nextTask);</span><br><span class="line"></span><br><span class="line">    <span class="comment">// 如果有足够的时间进行下一个任务则继续执行</span></span><br><span class="line">    taskFinishTime = <span class="built_in">window</span>.performance.now();</span><br><span class="line">  &#125; <span class="keyword">while</span> (taskFinishTime - taskStartTime &lt; 
<span class="number">3</span>);</span><br><span class="line"></span><br><span class="line">  <span class="keyword">if</span> (taskList.length &gt; <span class="number">0</span>)</span><br><span class="line">    requestAnimationFrame(processTaskList);</span><br><span class="line"></span><br><span class="line">&#125;</span><br></pre></td></tr></table></figure>
<p>创建一个<code>Web Workers</code>对象很简单，只需要调用<code>Worker()</code>构造器，然后传入指定脚本的<code>URI</code>。现代主流浏览器均支持<code>Web Workers</code>，除了<code>Internet Explorer</code>（又是万恶的IE），所以我们在下面的示例代码中还需要检测浏览器是否兼容。</p>
<figure class="highlight javascript"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">var</span> myWorker;</span><br><span class="line"></span><br><span class="line"><span class="keyword">if</span> (<span class="keyword">typeof</span>(Worker) !== <span class="string">&quot;undefined&quot;</span>) &#123;</span><br><span class="line">	<span class="comment">// 支持Web Workers</span></span><br><span class="line">	myWorker = <span class="keyword">new</span> Worker(<span class="string">&quot;worker.js&quot;</span>);</span><br><span class="line">&#125; <span class="keyword">else</span> &#123;</span><br><span class="line">	<span class="comment">// 不支持Web Workers</span></span><br><span class="line">&#125;</span><br></pre></td></tr></table></figure>
<p><strong><code>Web Workers</code>与主线程之间通过<code>postMessage()</code>函数来发送信息，使用<code>onmessage()</code>事件处理函数来响应消息（主线程与子线程之间并没有共享数据，只是通过复制数据来交互）。</strong></p>
<figure class="highlight javascript"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br></pre></td><td class="code"><pre><span class="line">main.js: </span><br><span class="line"><span class="comment">// 在主线程js中发送数据到myWorker绑定的js脚本线程</span></span><br><span class="line">myWorker.postMessage(<span class="string">&quot;Hello,World&quot;</span>);</span><br><span class="line"><span class="built_in">console</span>.log(<span class="string">&#x27;Message posted to worker&#x27;</span>);</span><br><span class="line"> </span><br><span class="line">worker.js:</span><br><span class="line"><span class="comment">// onmessage处理函数允许我们在任何时刻，</span></span><br><span class="line"><span class="comment">// 一旦接收到消息就可以执行一些代码，代码中消息本身作为事件的data属性进行使用。</span></span><br><span class="line">onmessage = <span class="function"><span class="keyword">function</span>(<span class="params">data</span>) </span>&#123;</span><br><span class="line">    <span class="built_in">console</span>.log(<span class="string">&quot;Message received from main script.&quot;</span>);</span><br><span class="line">	<span class="built_in">console</span>.log(<span class="string">&quot;Posting message back to main script.&quot;</span>);</span><br><span class="line">	postMessage(<span class="string">&quot;Hello~&quot;</span>);</span><br><span class="line">&#125;</span><br><span class="line"></span><br><span class="line">main.js:</span><br><span 
class="line"><span class="comment">// 主线程使用onmessage接收消息</span></span><br><span class="line">myWorker.onmessage = <span class="function"><span class="keyword">function</span>(<span class="params">data</span>) </span>&#123;</span><br><span class="line">    <span class="built_in">console</span>.log(<span class="string">&quot;Received message: &quot;</span> + data);</span><br><span class="line">&#125;</span><br></pre></td></tr></table></figure>
<p>如果你需要从主线程中立刻终止一个运行中的worker，可以调用worker的<code>terminate()</code>函数：</p>
<figure class="highlight javascript"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line">myWorker.terminate();</span><br></pre></td></tr></table></figure>
<p>myWorker会被立即杀死，不会有任何机会让它继续完成剩下的工作。而在worker线程中也可以调用<code>close()</code>函数进行关闭：</p>
<figure class="highlight javascript"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line">close();</span><br></pre></td></tr></table></figure>
<p>有关更多的<code>Web Workers</code>使用方法，请参考<a target="_blank" rel="noopener" href="https://developer.mozilla.org/en-US/docs/Web/API/Web_Workers_API/Using_web_workers">Using Web Workers - Web APIs | MDN</a>。</p>
<h3 id="降低样式计算的复杂度"><a href="#降低样式计算的复杂度" class="headerlink" title="降低样式计算的复杂度"></a>降低样式计算的复杂度</h3><hr>
<p><strong>每次修改<code>DOM</code>和<code>CSS</code>都会导致浏览器重新计算样式</strong>，在很多情况下还会对页面或页面的一部分重新进行布局计算。</p>
<p>计算样式的第一部分是创建一组匹配选择器（用于计算哪些节点应用哪些样式），第二部分涉及从匹配选择器中获取所有样式规则，并计算出节点的最终样式。</p>
<p><strong>通过降低选择器的复杂性可以提升样式计算的速度。</strong></p>
<p>下面是一个复杂的<code>CSS</code>选择器：</p>
<figure class="highlight css"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br></pre></td><td class="code"><pre><span class="line"><span class="selector-class">.box</span><span class="selector-pseudo">:nth-last-child(-n+1)</span> <span class="selector-class">.title</span> &#123;</span><br><span class="line">  <span class="comment">/* styles */</span></span><br><span class="line">&#125;</span><br></pre></td></tr></table></figure>
<p>浏览器如果想要找到应用该样式的节点，需要先找到有<code>.title</code>类的节点，然后判断其祖先节点中是否存在带<code>.box</code>类、并且恰好是其父元素最后一个子元素（即<code>:nth-last-child(-n+1)</code>）的节点。浏览器计算此结果可能需要大量的时间，但我们可以把选择器的预期行为更改为一个类：</p>
<figure class="highlight css"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br></pre></td><td class="code"><pre><span class="line"><span class="selector-class">.final-box-title</span> &#123;</span><br><span class="line">  <span class="comment">/* styles */</span></span><br><span class="line">&#125;</span><br></pre></td></tr></table></figure>
<p>我们只是将<code>CSS</code>的命名模块化（降低选择器的复杂性），然后只让浏览器简单地将选择器与节点进行匹配，这样浏览器计算样式的效率会提升许多。</p>
<p><code>BEM</code>是一种模块化的<code>CSS</code>命名规范，使用这种方法组织<code>CSS</code>不仅结构上十分清晰，也对浏览器的样式查找提供了帮助。</p>
<p><code>BEM</code>其实就是<code>Block,Element,Modifier</code>，它是一种基于组件的开发方式，其背后的思想就是将用户界面划分为独立的块。这样即使是使用复杂的<code>UI</code>也可以轻松快速地开发，并且模块化的方式可以提高代码的复用性。</p>
<p><strong><code>Block</code>是一个功能独立的页面组件（可以被重用），<code>Block</code>的命名方式就像写<code>Class</code>名一样</strong>。如下面的<code>.button</code>就是代表<code>&lt;button&gt;</code>的<code>Block</code>。</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br></pre></td><td class="code"><pre><span class="line">.button &#123;</span><br><span class="line">    background-color: red;</span><br><span class="line">&#125;</span><br><span class="line"></span><br><span class="line">&lt;button class&#x3D;&quot;button&quot;&gt;I&#39;m a button&lt;&#x2F;button&gt;</span><br></pre></td></tr></table></figure>
<p><strong><code>Element</code>是一个不能单独使用的<code>Block</code>的复合部分</strong>。可以认为<code>Element</code>是<code>Block</code>的子节点。</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br></pre></td><td class="code"><pre><span class="line">&lt;!-- &#96;search-form&#96;是一个block --&gt;</span><br><span class="line">&lt;form class&#x3D;&quot;search-form&quot;&gt;</span><br><span class="line">    &lt;!-- &#39;search-form__input&#39;是&#39;search-form&#39; block中的一个element --&gt;</span><br><span class="line">    &lt;input class&#x3D;&quot;search-form__input&quot;&gt;</span><br><span class="line"></span><br><span class="line">    &lt;!-- &#39;search-form__button&#39;是&#39;search-form&#39; block中的一个element  --&gt;</span><br><span class="line">    &lt;button class&#x3D;&quot;search-form__button&quot;&gt;Search&lt;&#x2F;button&gt;</span><br><span class="line">&lt;&#x2F;form&gt;</span><br></pre></td></tr></table></figure>
<p><strong><code>Modifier</code>是用于定义<code>Block</code>或<code>Element</code>的外观、状态或行为的实体</strong>。假设，我们有了一个新的需求，对<code>button</code>的背景颜色使用绿色，那么我们可以使用<code>Modifier</code>对<code>.button</code>进行一次扩展：</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br></pre></td><td class="code"><pre><span class="line">.button &#123;</span><br><span class="line">    background-color: red;</span><br><span class="line">&#125;</span><br><span class="line"></span><br><span class="line">.button--secondary &#123;</span><br><span class="line">    background-color: green;</span><br><span class="line">&#125;</span><br></pre></td></tr></table></figure>
<p>第一次接触<code>BEM</code>的童鞋可能会对这种命名方式感到奇怪，但<code>BEM</code>重要的是模块化与可维护性的思想，至于命名完全可以按照你所能接受的方式修改。限于篇幅，本文就不再继续探讨<code>BEM</code>了，感兴趣的童鞋可以去看<a target="_blank" rel="noopener" href="https://en.bem.info/methodology/quick-start/">BEM的官方文档</a>。</p>
<h3 id="避免强制同步布局和布局抖动"><a href="#避免强制同步布局和布局抖动" class="headerlink" title="避免强制同步布局和布局抖动"></a>避免强制同步布局和布局抖动</h3><hr>
<p><strong>浏览器每次进行布局计算时几乎总是会作用到整个<code>DOM</code>，如果有大量元素，那么将会需要很长时间才能计算出所有元素的位置与尺寸。</strong></p>
<p>所以我们<strong>应当尽量避免在运行时动态地修改几何属性</strong>（宽度、高度等），因为这些改动都会导致浏览器重新进行布局计算。如果无法避免，那么要<strong>优先使用<code>Flexbox</code></strong>，它会尽量减少布局所需的开销。</p>
<p><strong>强制同步布局就是使用<code>JavaScript</code>强制浏览器提前执行布局</strong>。需要先明白一点，<strong>在<code>JavaScript</code>运行时，来自上一帧的所有旧布局值都是已知的。</strong></p>
<p>以下代码为例，它在每一帧的开头输出了元素的高度：</p>
<figure class="highlight javascript"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br></pre></td><td class="code"><pre><span class="line">requestAnimationFrame(logBoxHeight);</span><br><span class="line"></span><br><span class="line"><span class="function"><span class="keyword">function</span> <span class="title">logBoxHeight</span>(<span class="params"></span>) </span>&#123;</span><br><span class="line">  <span class="built_in">console</span>.log(box.offsetHeight);</span><br><span class="line">&#125;</span><br></pre></td></tr></table></figure>
<p>但如果在请求高度之前，修改了其样式，就会出现问题，浏览器必须先应用样式，然后进行布局计算，之后才能返回正确的高度。这是不必要的且会产生非常大的开销。</p>
<figure class="highlight javascript"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br></pre></td><td class="code"><pre><span class="line"><span class="function"><span class="keyword">function</span> <span class="title">logBoxHeight</span>(<span class="params"></span>) </span>&#123;</span><br><span class="line">  box.classList.add(<span class="string">&#x27;super-big&#x27;</span>);</span><br><span class="line">  </span><br><span class="line">  <span class="built_in">console</span>.log(box.offsetHeight);</span><br><span class="line">&#125;</span><br></pre></td></tr></table></figure>
<p>正确的做法是先读取布局值（利用浏览器可以直接使用上一帧布局值的特性），然后再执行任何写操作：</p>
<figure class="highlight javascript"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br></pre></td><td class="code"><pre><span class="line"><span class="function"><span class="keyword">function</span> <span class="title">logBoxHeight</span>(<span class="params"></span>) </span>&#123;</span><br><span class="line">  <span class="built_in">console</span>.log(box.offsetHeight);</span><br><span class="line"></span><br><span class="line">  box.classList.add(<span class="string">&#x27;super-big&#x27;</span>);</span><br><span class="line">&#125;</span><br></pre></td></tr></table></figure>
<p><strong>如果接二连三地发生强制同步布局，那么就会产生布局抖动</strong>。以下代码循环处理一组段落，并设置每个段落的宽度以匹配一个名为“box”的元素的宽度。</p>
<figure class="highlight javascript"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br></pre></td><td class="code"><pre><span class="line"><span class="function"><span class="keyword">function</span> <span class="title">resizeAllParagraphsToMatchBlockWidth</span>(<span class="params"></span>) </span>&#123;</span><br><span class="line">  <span class="keyword">for</span> (<span class="keyword">var</span> i = <span class="number">0</span>; i &lt; paragraphs.length; i++) &#123;</span><br><span class="line">    paragraphs[i].style.width = box.offsetWidth + <span class="string">&#x27;px&#x27;</span>;</span><br><span class="line">  &#125;</span><br><span class="line">&#125;</span><br></pre></td></tr></table></figure>
<p>这段代码的问题在于每次迭代都会读取<code>box.offsetWidth</code>，然后立即使用此值来更新段落的宽度。在循环的下次迭代中，浏览器必须考虑样式更新这一事实（<code>box.offsetWidth</code>是在上一次迭代中请求的），因此它必须应用样式更改，然后执行布局。这会导致每次迭代都会产生强制同步布局，正确的做法应该先读取值，然后再写入值。</p>
<figure class="highlight javascript"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br></pre></td><td class="code"><pre><span class="line"><span class="comment">// Read.</span></span><br><span class="line"><span class="keyword">var</span> width = box.offsetWidth;</span><br><span class="line"></span><br><span class="line"><span class="function"><span class="keyword">function</span> <span class="title">resizeAllParagraphsToMatchBlockWidth</span>(<span class="params"></span>) </span>&#123;</span><br><span class="line">  <span class="keyword">for</span> (<span class="keyword">var</span> i = <span class="number">0</span>; i &lt; paragraphs.length; i++) &#123;</span><br><span class="line">    <span class="comment">// Now write.</span></span><br><span class="line">    paragraphs[i].style.width = width + <span class="string">&#x27;px&#x27;</span>;</span><br><span class="line">  &#125;</span><br><span class="line">&#125;</span><br></pre></td></tr></table></figure>
<p>要想轻松地解决这个问题，可以使用<a target="_blank" rel="noopener" href="https://github.com/wilsonpage/fastdom">FastDOM</a>进行批量读取与写入，它可以防止强制布局同步与布局抖动。</p>
<h3 id="使用不会触发布局与绘制的属性来实现动画"><a href="#使用不会触发布局与绘制的属性来实现动画" class="headerlink" title="使用不会触发布局与绘制的属性来实现动画"></a>使用不会触发布局与绘制的属性来实现动画</h3><hr>
<p>在像素管道一节中，我们发现有种属性修改后会跳过布局与绘制阶段，这显然会减少不少性能开销。目前只有两种属性符合这个条件：<code>transform</code>和<code>opacity</code> 。</p>
<p>需要注意的是，使用<code>transform</code>和<code>opacity</code>时，更改这些属性所在的元素应处于其自身的图层，<strong>所以我们需要将设置动画的元素单独新建一个图层（这样做的好处是该图层上的重绘可以在不影响其他图层上元素的情况下进行处理。如果你用过<code>Photoshop</code>，想必能够理解多图层工作的方便之处）。</strong></p>
<p>创建新图层的最佳方式是使用<code>will-change</code>属性，<strong>该属性告知浏览器该元素会有哪些变化，这样浏览器可以在元素属性真正发生变化之前提前做好对应的优化准备工作。</strong></p>
<figure class="highlight css"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br></pre></td><td class="code"><pre><span class="line"><span class="selector-class">.moving-element</span> &#123;</span><br><span class="line">  <span class="attribute">will-change</span>: transform;</span><br><span class="line">&#125;</span><br><span class="line"></span><br><span class="line"><span class="comment">/* 对于不支持 will-change 但受益于层创建的浏览器，需要使用（滥用）3D 变形来强制创建一个新层 */</span></span><br><span class="line"><span class="selector-class">.moving-element</span> &#123;</span><br><span class="line">  <span class="attribute">transform</span>: <span class="built_in">translateZ</span>(<span class="number">0</span>);</span><br><span class="line">&#125;</span><br></pre></td></tr></table></figure>
<p><strong>但不要认为<code>will-change</code>可以提高性能就随便滥用，使用<code>will-change</code>进行预优化与创建图层都需要额外的内存和管理开销，随便滥用只会得不偿失。</strong></p>
<h3 id="参考文献"><a href="#参考文献" class="headerlink" title="参考文献"></a>参考文献</h3><hr>
<ul>
<li><p><a target="_blank" rel="noopener" href="https://developers.google.com/web/">Web   |  Google Developers</a></p>
</li>
<li><p><a target="_blank" rel="noopener" href="https://developer.mozilla.org/en-US/docs/Web/API/Web_Workers_API/Using_web_workers">Using Web Workers - Web APIs | MDN</a></p>
</li>
<li><p><a target="_blank" rel="noopener" href="https://developer.mozilla.org/en-US/docs/Web/CSS/will-change">will-change - CSS | MDN</a></p>
</li>
<li><p><a target="_blank" rel="noopener" href="https://en.bem.info/methodology/quick-start/">Quick start / Methodology / BEM</a></p>
</li>
</ul>

      
    </div>

    
    
    

    <footer class="post-footer">
        <div class="post-eof"></div>
      
    </footer>
  </article>
</div>




    


<div class="post-block">
  
  

  <article itemscope itemtype="http://schema.org/Article" class="post-content" lang="zh">
    <link itemprop="mainEntityOfPage" href="https://suyuhuan.gitee.io/yuwanzi.io/2017/10/03/2017-10-03-BrowserCriticalRenderingPath/">

    <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
      <meta itemprop="image" content="/yuwanzi.io/images/avatar.gif">
      <meta itemprop="name" content="玉丸子">
      <meta itemprop="description" content="这里是玉丸子的个人博客,与你一起发现更大的世界。">
    </span>

    <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
      <meta itemprop="name" content="玉丸子 | Blog">
    </span>
      <header class="post-header">
        <h2 class="post-title" itemprop="name headline">
          <a href="/yuwanzi.io/2017/10/03/2017-10-03-BrowserCriticalRenderingPath/" class="post-title-link" itemprop="url">浏览器渲染过程与性能优化</a>
        </h2>

        <div class="post-meta-container">
          <div class="post-meta">
    <span class="post-meta-item">
      <span class="post-meta-item-icon">
        <i class="far fa-calendar"></i>
      </span>
      <span class="post-meta-item-text">Veröffentlicht am</span>

      <time title="Erstellt: 2017-10-03 18:00:00" itemprop="dateCreated datePublished" datetime="2017-10-03T18:00:00+08:00">2017-10-03</time>
    </span>
      <span class="post-meta-item">
        <span class="post-meta-item-icon">
          <i class="far fa-calendar-check"></i>
        </span>
        <span class="post-meta-item-text">Bearbeitet am</span>
        <time title="Geändert am: 2020-11-07 08:58:17" itemprop="dateModified" datetime="2020-11-07T08:58:17+08:00">2020-11-07</time>
      </span>
    <span class="post-meta-item">
      <span class="post-meta-item-icon">
        <i class="far fa-folder"></i>
      </span>
      <span class="post-meta-item-text">in</span>
        <span itemprop="about" itemscope itemtype="http://schema.org/Thing">
          <a href="/yuwanzi.io/categories/%E5%89%8D%E7%AB%AF/" itemprop="url" rel="index"><span itemprop="name">前端</span></a>
        </span>
          . 
        <span itemprop="about" itemscope itemtype="http://schema.org/Thing">
          <a href="/yuwanzi.io/categories/%E5%89%8D%E7%AB%AF/%E6%B5%8F%E8%A7%88%E5%99%A8/" itemprop="url" rel="index"><span itemprop="name">浏览器</span></a>
        </span>
    </span>

  
</div>

        </div>
      </header>

    
    
    
    <div class="post-body" itemprop="articleBody">
          <p>大家都知道万维网的应用层使用了<code>HTTP</code>协议，并且用浏览器作为入口访问网络上的资源。用户在使用浏览器访问一个网站时需要先通过<code>HTTP</code>协议向服务器发送请求，之后服务器返回<code>HTML</code>文件与响应信息。这时，浏览器会根据<code>HTML</code>文件来进行解析与渲染（该阶段还包括向服务器请求非内联的<code>CSS</code>文件与<code>JavaScript</code>文件或者其他资源），最终再将页面呈现在用户面前。</p>
<p>现在我们知道了网页的渲染都是由浏览器完成的。如果一个网站的页面加载速度太慢，就会导致用户体验不够友好。本文通过详解浏览器渲染页面的过程来引入一些基本的浏览器性能优化方案，让浏览器更快地渲染你的网页并快速响应，从而提高用户体验。</p>
<blockquote>
<p>本文作者为: <a target="_blank" rel="noopener" href="https://github.com/SylvanasSun">SylvanasSun(sylvanas.sun@gmail.com)</a>.转载请务必将下面这段话置于文章开头处(保留超链接).<br>本文首发自<a target="_blank" rel="noopener" href="https://sylvanassun.github.io/">SylvanasSun Blog</a>,原文链接: <a target="_blank" rel="noopener" href="https://sylvanassun.github.io/2017/10/03/2017-10-03-BrowserCriticalRenderingPath">https://sylvanassun.github.io/2017/10/03/2017-10-03-BrowserCriticalRenderingPath</a></p>
</blockquote>
<h3 id="关键渲染路径"><a href="#关键渲染路径" class="headerlink" title="关键渲染路径"></a>关键渲染路径</h3><hr>
<p><strong>浏览器接收到服务器返回的<code>HTML</code>、<code>CSS</code>和<code>JavaScript</code>字节数据并对其进行解析和转变成像素的渲染过程被称为关键渲染路径。通过优化关键渲染路径即可以缩短浏览器渲染页面的时间。</strong></p>
<p><strong>浏览器在渲染页面前需要先构建出<code>DOM</code>树与<code>CSSOM</code>树</strong>（如果没有<code>DOM</code>树和<code>CSSOM</code>树就无法确定页面的结构与样式，所以这两项是必须先构建出来的）。</p>
<p><strong><code>DOM</code>树全称为<code>Document Object Model</code>文档对象模型，它是<code>HTML</code>和<code>XML</code>文档的编程接口，提供了对文档的结构化表示，并定义了一种可以使程序对该结构进行访问的方式</strong>（比如<code>JavaScript</code>就是通过<code>DOM</code>来操作结构、样式和内容）。<code>DOM</code>将文档解析为一个由节点和对象组成的集合，可以说一个<code>WEB</code>页面其实就是一个<code>DOM</code>。</p>
<p><code>CSSOM</code>树全称为<code>Cascading Style Sheets Object Model</code>层叠样式表对象模型，它与<code>DOM</code>树的含义相差不大，只不过它是<code>CSS</code>的对象集合。</p>
<h3 id="构建DOM树与CSSOM树"><a href="#构建DOM树与CSSOM树" class="headerlink" title="构建DOM树与CSSOM树"></a>构建DOM树与CSSOM树</h3><hr>
<p>浏览器从网络或硬盘中获得<code>HTML</code>字节数据后会经过一个流程将字节解析为<code>DOM</code>树：</p>
<ul>
<li><p>编码： <strong>先将<code>HTML</code>的原始字节数据转换为文件指定编码的字符。</strong></p>
</li>
<li><p>令牌化： 然后<strong>浏览器会根据<code>HTML</code>规范来将字符串转换成各种令牌</strong>（如<code>&lt;html&gt;</code>、<code>&lt;body&gt;</code>这样的标签以及标签中的字符串和属性等都会被转化为令牌，每个令牌具有特殊含义和一组规则）。令牌记录了标签的开始与结束，通过这个特性可以轻松判断一个标签是否为子标签（假设有<code>&lt;html&gt;</code>与<code>&lt;body&gt;</code>两个标签，当<code>&lt;html&gt;</code>标签的令牌还未遇到它的结束令牌<code>&lt;/html&gt;</code>就遇见了<code>&lt;body&gt;</code>标签令牌，那么<code>&lt;body&gt;</code>就是<code>&lt;html&gt;</code>的子标签）。</p>
</li>
<li><p>生成对象： <strong>接下来每个令牌都会被转换成定义其属性和规则的对象（这个对象就是节点对象）。</strong></p>
</li>
<li><p>构建完毕： <strong><code>DOM</code>树构建完成，整个对象集合就像是一棵树形结构</strong>。可能有人会疑惑为什么<code>DOM</code>是一个树形结构，这是因为标签之间含有复杂的父子关系，树形结构正好可以诠释这个关系（<code>CSSOM</code>同理，层叠样式也含有父子关系。例如： <code>div p &#123;font-size: 18px&#125;</code>，会先寻找所有<code>p</code>标签并判断它的父标签是否为<code>div</code>之后才会决定要不要采用这个样式进行渲染）。</p>
</li>
</ul>
<p>整个<code>DOM</code>树的构建过程其实就是： <strong>字节 -&gt; 字符 -&gt; 令牌 -&gt; 节点对象 -&gt; 对象模型</strong>，下面将通过一个示例<code>HTML</code>代码与配图更形象地解释这个过程。</p>
<figure class="highlight html"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br></pre></td><td class="code"><pre><span class="line"><span class="tag">&lt;<span class="name">html</span>&gt;</span></span><br><span class="line">  <span class="tag">&lt;<span class="name">head</span>&gt;</span></span><br><span class="line">    <span class="tag">&lt;<span class="name">meta</span> <span class="attr">name</span>=<span class="string">&quot;viewport&quot;</span> <span class="attr">content</span>=<span class="string">&quot;width=device-width,initial-scale=1&quot;</span>&gt;</span></span><br><span class="line">    <span class="tag">&lt;<span class="name">link</span> <span class="attr">href</span>=<span class="string">&quot;style.css&quot;</span> <span class="attr">rel</span>=<span class="string">&quot;stylesheet&quot;</span>&gt;</span></span><br><span class="line">    <span class="tag">&lt;<span class="name">title</span>&gt;</span>Critical Path<span class="tag">&lt;/<span class="name">title</span>&gt;</span></span><br><span class="line">  <span class="tag">&lt;/<span class="name">head</span>&gt;</span></span><br><span class="line">  <span class="tag">&lt;<span class="name">body</span>&gt;</span></span><br><span class="line">    <span class="tag">&lt;<span class="name">p</span>&gt;</span>Hello <span class="tag">&lt;<span class="name">span</span>&gt;</span>web performance<span class="tag">&lt;/<span class="name">span</span>&gt;</span> students!<span class="tag">&lt;/<span class="name">p</span>&gt;</span></span><br><span class="line">    <span class="tag">&lt;<span class="name">div</span>&gt;</span><span class="tag">&lt;<span class="name">img</span> <span 
class="attr">src</span>=<span class="string">&quot;awesome-photo.jpg&quot;</span>&gt;</span><span class="tag">&lt;/<span class="name">div</span>&gt;</span></span><br><span class="line">  <span class="tag">&lt;/<span class="name">body</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;/<span class="name">html</span>&gt;</span></span><br></pre></td></tr></table></figure>
<p><img src="http://wx2.sinaimg.cn/large/63503acbly1fk2te7tuh0j20nv0dbjtl.jpg" alt="DOM树构建过程"></p>
<p>当上述<code>HTML</code>代码遇见<code>&lt;link&gt;</code>标签时，浏览器会发送请求获得该标签中标记的<code>CSS</code>文件（使用内联<code>CSS</code>可以省略请求的步骤提高速度，但没有必要为了这点速度而丢失了模块化与可维护性），<code>style.css</code>中的内容如下：</p>
<figure class="highlight css"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br></pre></td><td class="code"><pre><span class="line"><span class="selector-tag">body</span> &#123; <span class="attribute">font-size</span>: <span class="number">16px</span> &#125;</span><br><span class="line"><span class="selector-tag">p</span> &#123; <span class="attribute">font-weight</span>: bold &#125;</span><br><span class="line"><span class="selector-tag">span</span> &#123; <span class="attribute">color</span>: red &#125;</span><br><span class="line"><span class="selector-tag">p</span> <span class="selector-tag">span</span> &#123; <span class="attribute">display</span>: none &#125;</span><br><span class="line"><span class="selector-tag">img</span> &#123; <span class="attribute">float</span>: right &#125;</span><br></pre></td></tr></table></figure>
<p>浏览器获得外部<code>CSS</code>文件的数据后，就会像构建<code>DOM</code>树一样开始构建<code>CSSOM</code>树，这个过程没有什么特别的差别。</p>
<p><img src="http://wx1.sinaimg.cn/large/63503acbly1fk2te87pafj20hi08mgm3.jpg" alt="CSSOM树"></p>
<p>如果想要更详细地去体验一下关键渲染路径的构建，可以使用<code>Chrome</code>开发者工具中的<code>Timeline</code>功能，它记录了浏览器从请求页面资源一直到渲染的各种操作过程，甚至还可以录制某一时间段的过程（建议不要去看太大的网站，信息会比较杂乱）。</p>
<p><img src="http://wx4.sinaimg.cn/large/63503acbly1fk2tubeknuj20uf0e1abt.jpg" alt="Timeline"></p>
<h3 id="构建渲染树"><a href="#构建渲染树" class="headerlink" title="构建渲染树"></a>构建渲染树</h3><hr>
<p>在构建了<code>DOM</code>树和<code>CSSOM</code>树之后，浏览器只是拥有了两个互相独立的对象集合，<code>DOM</code>树描述了文档的结构与内容，<code>CSSOM</code>树则描述了对文档应用的样式规则，<strong>想要渲染出页面，就需要将<code>DOM</code>树与<code>CSSOM</code>树结合在一起</strong>，这就是渲染树。</p>
<p><img src="http://wx3.sinaimg.cn/large/63503acbly1fk2tubr7irj20oo0bmacf.jpg" alt="渲染树"></p>
<ul>
<li><p>浏览器会先从<code>DOM</code>树的根节点开始遍历每个可见节点（不可见的节点自然就没必要渲染到页面了，不可见的节点还包括被<code>CSS</code>设置了<code>display: none</code>属性的节点，值得注意的是<code>visibility: hidden</code>属性并不算是不可见属性，它的语义是隐藏元素，但元素仍然占据着布局空间，所以它会被渲染成一个空框）。</p>
</li>
<li><p>对每个可见节点，找到其适配的<code>CSS</code>样式规则并应用。</p>
</li>
<li><p>渲染树构建完成，每个节点都是可见节点并且都含有其内容和对应规则的样式。</p>
</li>
</ul>
<p>渲染树构建完毕后，浏览器得到了每个可见节点的内容与其样式，下一步工作则<strong>需要计算每个节点在窗口内的确切位置与大小，也就是布局阶段。</strong></p>
<p><code>CSS</code>采用了一种叫做盒子模型的思维模型来表示每个节点与其他元素之间的距离，盒子模型包括外边距(<code>Margin</code>)，内边距(<code>Padding</code>)，边框(<code>Border</code>)，内容(<code>Content</code>)。页面中的每个标签其实都是一个个盒子。</p>
<p><img src="http://wx1.sinaimg.cn/large/63503acbly1fk2vm967xsj206105aq2t.jpg" alt="盒子模型"></p>
<p><strong>布局阶段会从渲染树的根节点开始遍历，然后确定每个节点对象在页面上的确切大小与位置</strong>，布局阶段的输出是一个盒子模型，它会精确地捕获每个元素在屏幕内的确切位置与大小，所有相对的测量值也都会被转换为屏幕内的绝对像素值。</p>
<figure class="highlight html"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br></pre></td><td class="code"><pre><span class="line"><span class="tag">&lt;<span class="name">html</span>&gt;</span></span><br><span class="line">  <span class="tag">&lt;<span class="name">head</span>&gt;</span></span><br><span class="line">    <span class="tag">&lt;<span class="name">meta</span> <span class="attr">name</span>=<span class="string">&quot;viewport&quot;</span> <span class="attr">content</span>=<span class="string">&quot;width=device-width,initial-scale=1&quot;</span>&gt;</span></span><br><span class="line">    <span class="tag">&lt;<span class="name">title</span>&gt;</span>Critial Path: Hello world!<span class="tag">&lt;/<span class="name">title</span>&gt;</span></span><br><span class="line">  <span class="tag">&lt;/<span class="name">head</span>&gt;</span></span><br><span class="line">  <span class="tag">&lt;<span class="name">body</span>&gt;</span></span><br><span class="line">    <span class="tag">&lt;<span class="name">div</span> <span class="attr">style</span>=<span class="string">&quot;width: 50%&quot;</span>&gt;</span></span><br><span class="line">      <span class="tag">&lt;<span class="name">div</span> <span class="attr">style</span>=<span class="string">&quot;width: 50%&quot;</span>&gt;</span>Hello world!<span class="tag">&lt;/<span class="name">div</span>&gt;</span></span><br><span class="line">    <span class="tag">&lt;/<span class="name">div</span>&gt;</span></span><br><span class="line">  <span class="tag">&lt;/<span class="name">body</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;/<span 
class="name">html</span>&gt;</span></span><br></pre></td></tr></table></figure>
<p><img src="http://wx3.sinaimg.cn/large/63503acbly1fk2vdt67wuj20hm08f3yl.jpg" alt="上述代码的布局结果"></p>
<p>当<code>Layout</code>布局事件完成后，浏览器会立即发出<code>Paint Setup</code>与<code>Paint</code>事件，开始将渲染树绘制成像素，绘制所需的时间跟<code>CSS</code>样式的复杂度成正比，绘制完成后，用户就可以看到页面的最终呈现效果了。</p>
<p>我们对一个网页发送请求并获得渲染后的页面可能也就经过了1~2秒，但浏览器其实已经做了上述所讲的非常多的工作，总结一下浏览器关键渲染路径的整个过程：</p>
<ul>
<li><p>处理<code>HTML</code>标记数据并生成<code>DOM</code>树。</p>
</li>
<li><p>处理<code>CSS</code>标记数据并生成<code>CSSOM</code>树。</p>
</li>
<li><p>将<code>DOM</code>树与<code>CSSOM</code>树合并在一起生成渲染树。</p>
</li>
<li><p>遍历渲染树开始布局，计算每个节点的位置信息。</p>
</li>
<li><p>将每个节点绘制到屏幕。</p>
</li>
</ul>
<h3 id="渲染阻塞的优化方案"><a href="#渲染阻塞的优化方案" class="headerlink" title="渲染阻塞的优化方案"></a>渲染阻塞的优化方案</h3><hr>
<p>浏览器想要渲染一个页面就必须先构建出<code>DOM</code>树与<code>CSSOM</code>树，如果<code>HTML</code>与<code>CSS</code>文件结构非常庞大与复杂，这显然会给页面加载速度带来严重影响。</p>
<p>所谓渲染阻塞资源，即是对该资源发送请求后还需要先构建对应的<code>DOM</code>树或<code>CSSOM</code>树，这种行为显然会延迟渲染操作的开始时间。<strong><code>HTML</code>、<code>CSS</code>、<code>JavaScript</code>都是会对渲染产生阻塞的资源，<code>HTML</code>是必需的（没有<code>DOM</code>还谈何渲染），但还可以从<code>CSS</code>与<code>JavaScript</code>着手优化，尽可能地减少阻塞的产生。</strong></p>
<h4 id="优化CSS"><a href="#优化CSS" class="headerlink" title="优化CSS"></a>优化CSS</h4><hr>
<p>如果可以让<code>CSS</code>资源只在特定条件下使用，这样这些资源就可以在首次加载时先不进行构建<code>CSSOM</code>树，只有在符合特定条件时，才会让浏览器进行阻塞渲染然后构建<code>CSSOM</code>树。</p>
<p><code>CSS</code>的媒体查询正是用来实现这个功能的，它由媒体类型以及零个或多个检查特定媒体特征状况的表达式组成。</p>
<figure class="highlight html"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br></pre></td><td class="code"><pre><span class="line"><span class="comment">&lt;!-- 没有使用媒体查询，这个css资源会阻塞渲染  --&gt;</span></span><br><span class="line"><span class="tag">&lt;<span class="name">link</span> <span class="attr">href</span>=<span class="string">&quot;style.css&quot;</span>    <span class="attr">rel</span>=<span class="string">&quot;stylesheet&quot;</span>&gt;</span></span><br><span class="line"><span class="comment">&lt;!-- all是默认类型，它和不设置媒体查询的效果是一样的 --&gt;</span></span><br><span class="line"><span class="tag">&lt;<span class="name">link</span> <span class="attr">href</span>=<span class="string">&quot;style.css&quot;</span>    <span class="attr">rel</span>=<span class="string">&quot;stylesheet&quot;</span> <span class="attr">media</span>=<span class="string">&quot;all&quot;</span>&gt;</span></span><br><span class="line"><span class="comment">&lt;!-- 动态媒体查询， 将在网页加载时计算。</span></span><br><span class="line"><span class="comment">根据网页加载时设备的方向，portrait.css 可能阻塞渲染，也可能不阻塞渲染。--&gt;</span></span><br><span class="line"><span class="tag">&lt;<span class="name">link</span> <span class="attr">href</span>=<span class="string">&quot;portrait.css&quot;</span> <span class="attr">rel</span>=<span class="string">&quot;stylesheet&quot;</span> <span class="attr">media</span>=<span class="string">&quot;orientation:portrait&quot;</span>&gt;</span></span><br><span class="line"><span class="comment">&lt;!-- 只在打印网页时应用，因此网页首次在浏览器中加载时，它不会阻塞渲染。 --&gt;</span></span><br><span class="line"><span class="tag">&lt;<span class="name">link</span> <span class="attr">href</span>=<span class="string">&quot;print.css&quot;</span>    <span 
class="attr">rel</span>=<span class="string">&quot;stylesheet&quot;</span> <span class="attr">media</span>=<span class="string">&quot;print&quot;</span>&gt;</span></span><br></pre></td></tr></table></figure>
<p><strong>使用媒体查询可以让<code>CSS</code>资源不在首次加载中阻塞渲染，但不管是哪种<code>CSS</code>资源它们的下载请求都不会被忽略，浏览器仍然会先下载<code>CSS</code>文件。</strong></p>
<h4 id="优化JavaScript"><a href="#优化JavaScript" class="headerlink" title="优化JavaScript"></a>优化JavaScript</h4><hr>
<p><strong>当浏览器的<code>HTML</code>解析器遇到一个<code>script</code>标记时会暂停构建<code>DOM</code>，然后将控制权移交至<code>JavaScript</code>引擎，这时引擎会开始执行<code>JavaScript</code>脚本，直到执行结束后，浏览器才会从之前中断的地方恢复，然后继续构建<code>DOM</code>。每次去执行<code>JavaScript</code>脚本都会严重地阻塞<code>DOM</code>树的构建，如果<code>JavaScript</code>脚本还操作了<code>CSSOM</code>，而正好这个<code>CSSOM</code>还没有下载和构建，浏览器甚至会延迟脚本执行和构建<code>DOM</code>，直至完成其<code>CSSOM</code>的下载和构建</strong>。显而易见，如果对<code>JavaScript</code>的执行位置运用不当，这将会严重影响渲染的速度。</p>
<p>下面代码中的<code>JavaScript</code>脚本并不会生效，这是因为<code>DOM</code>树还没有构建到<code>&lt;p&gt;</code>标签时，<code>JavaScript</code>脚本就已经开始执行了。这也是为什么经常有人在<code>HTML</code>文件的最下方写内联<code>JavaScript</code>代码，又或者使用<code>window.onload()</code>和<code>JQuery</code>中的<code>$(function()&#123;&#125;)</code>（这两个函数有一些区别，<code>window.onload()</code>是等待页面完全加载完毕后触发的事件，而<code>$(function()&#123;&#125;)</code>在<code>DOM</code>树构建完毕后就会执行）。</p>
<figure class="highlight html"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br></pre></td><td class="code"><pre><span class="line"><span class="tag">&lt;<span class="name">html</span>&gt;</span></span><br><span class="line">  <span class="tag">&lt;<span class="name">head</span>&gt;</span></span><br><span class="line">    <span class="tag">&lt;<span class="name">meta</span> <span class="attr">name</span>=<span class="string">&quot;viewport&quot;</span> <span class="attr">content</span>=<span class="string">&quot;width=device-width,initial-scale=1&quot;</span>&gt;</span></span><br><span class="line">    <span class="tag">&lt;<span class="name">link</span> <span class="attr">href</span>=<span class="string">&quot;style.css&quot;</span> <span class="attr">rel</span>=<span class="string">&quot;stylesheet&quot;</span>&gt;</span></span><br><span class="line">    <span class="tag">&lt;<span class="name">title</span>&gt;</span>Hello,World<span class="tag">&lt;/<span class="name">title</span>&gt;</span></span><br><span class="line">    <span class="tag">&lt;<span class="name">script</span> <span class="attr">type</span>=<span class="string">&quot;text/javascript&quot;</span>&gt;</span></span><br><span class="line"><span class="javascript">    	<span class="keyword">var</span> p = <span class="built_in">document</span>.getElementsByTagName(<span class="string">&#x27;p&#x27;</span>)[<span class="number">0</span>];</span></span><br><span class="line"><span class="javascript">    	p.textContent = <span class="string">&#x27;SylvanasSun&#x27;</span>;	
</span></span><br><span class="line">    <span class="tag">&lt;/<span class="name">script</span>&gt;</span></span><br><span class="line">  <span class="tag">&lt;/<span class="name">head</span>&gt;</span></span><br><span class="line">  <span class="tag">&lt;<span class="name">body</span>&gt;</span></span><br><span class="line">    <span class="tag">&lt;<span class="name">p</span>&gt;</span>Hello,World!<span class="tag">&lt;/<span class="name">p</span>&gt;</span></span><br><span class="line">  <span class="tag">&lt;/<span class="name">body</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;/<span class="name">html</span>&gt;</span></span><br></pre></td></tr></table></figure>
<p><strong>使用<code>async</code>可以通知浏览器该脚本不需要在引用位置执行</strong>，这样浏览器就可以继续构建<code>DOM</code>，<code>JavaScript</code>脚本会在就绪后开始执行，这样将显著提升页面首次加载的性能（<code>async</code>只对带有<code>src</code>属性的<code>&lt;script&gt;</code>标签生效，也就是外部引用的<code>JavaScript</code>文件）。</p>
<figure class="highlight html"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br></pre></td><td class="code"><pre><span class="line"><span class="comment">&lt;!-- 下面2个用法效果是等价的 --&gt;</span></span><br><span class="line"><span class="tag">&lt;<span class="name">script</span> <span class="attr">type</span>=<span class="string">&quot;text/javascript&quot;</span> <span class="attr">src</span>=<span class="string">&quot;demo_async.js&quot;</span> <span class="attr">async</span>=<span class="string">&quot;async&quot;</span>&gt;</span><span class="tag">&lt;/<span class="name">script</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;<span class="name">script</span> <span class="attr">type</span>=<span class="string">&quot;text/javascript&quot;</span> <span class="attr">src</span>=<span class="string">&quot;demo_async.js&quot;</span> <span class="attr">async</span>&gt;</span><span class="tag">&lt;/<span class="name">script</span>&gt;</span></span><br></pre></td></tr></table></figure>

<h3 id="优化关键渲染路径总结"><a href="#优化关键渲染路径总结" class="headerlink" title="优化关键渲染路径总结"></a>优化关键渲染路径总结</h3><hr>
<p>上文已经完整讲述了浏览器是如何渲染页面的以及渲染之前的准备工作，接下来我们以下面的案例来总结一下优化关键渲染路径的方法。</p>
<p>假设有一个<code>HTML</code>页面，它只引入了一个<code>CSS</code>外部文件：</p>
<figure class="highlight html"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br></pre></td><td class="code"><pre><span class="line"><span class="tag">&lt;<span class="name">html</span>&gt;</span></span><br><span class="line">  <span class="tag">&lt;<span class="name">head</span>&gt;</span></span><br><span class="line">    <span class="tag">&lt;<span class="name">meta</span> <span class="attr">name</span>=<span class="string">&quot;viewport&quot;</span> <span class="attr">content</span>=<span class="string">&quot;width=device-width,initial-scale=1&quot;</span>&gt;</span></span><br><span class="line">    <span class="tag">&lt;<span class="name">link</span> <span class="attr">href</span>=<span class="string">&quot;style.css&quot;</span> <span class="attr">rel</span>=<span class="string">&quot;stylesheet&quot;</span>&gt;</span></span><br><span class="line">  <span class="tag">&lt;/<span class="name">head</span>&gt;</span></span><br><span class="line">  <span class="tag">&lt;<span class="name">body</span>&gt;</span></span><br><span class="line">    <span class="tag">&lt;<span class="name">p</span>&gt;</span>Hello <span class="tag">&lt;<span class="name">span</span>&gt;</span>web performance<span class="tag">&lt;/<span class="name">span</span>&gt;</span> students!<span class="tag">&lt;/<span class="name">p</span>&gt;</span></span><br><span class="line">    <span class="tag">&lt;<span class="name">div</span>&gt;</span><span class="tag">&lt;<span class="name">img</span> <span class="attr">src</span>=<span class="string">&quot;awesome-photo.jpg&quot;</span>&gt;</span><span class="tag">&lt;/<span class="name">div</span>&gt;</span></span><br><span class="line">  <span 
class="tag">&lt;/<span class="name">body</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;/<span class="name">html</span>&gt;</span></span><br></pre></td></tr></table></figure>
<p>它的关键渲染路径如下：</p>
<p><img src="http://wx2.sinaimg.cn/large/63503acbly1fk56dzdbn9j20nr068dga.jpg" alt="只引入一个外部CSS文件时的关键渲染路径"></p>
<p>首先浏览器要先对服务器发送请求获得<code>HTML</code>文件，得到<code>HTML</code>文件后开始构建<code>DOM</code>树，在遇见<code>&lt;link&gt;</code>标签时浏览器需要向服务器再次发出请求来获得<code>CSS</code>文件，然后则是继续构建<code>DOM</code>树和<code>CSSOM</code>树，浏览器合并出渲染树，根据渲染树进行布局计算，执行绘制操作，页面渲染完成。</p>
<p>有以下几个用于描述关键渲染路径性能的词汇：</p>
<ul>
<li><p>关键资源：可能阻塞网页首次渲染的资源（上图中为2个，<code>HTML</code>文件与外部<code>CSS</code>文件<code>style.css</code>）。</p>
</li>
<li><p>关键路径长度： 获取关键资源所需的往返次数或总时间（上图为2次或以上，一次获取<code>HTML</code>文件，一次获取<code>CSS</code>文件，这个次数基于<code>TCP</code>协议的最大拥塞窗口，一个文件不一定能在一次连接内传输完毕）。</p>
</li>
<li><p>关键字节：所有关键资源文件大小的总和（上图为<code>9KB</code>）。</p>
</li>
</ul>
<p>接下来，案例代码的需求发生了变化，它新增了一个<code>JavaScript</code>文件。</p>
<figure class="highlight html"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br></pre></td><td class="code"><pre><span class="line"><span class="tag">&lt;<span class="name">html</span>&gt;</span></span><br><span class="line">  <span class="tag">&lt;<span class="name">head</span>&gt;</span></span><br><span class="line">    <span class="tag">&lt;<span class="name">meta</span> <span class="attr">name</span>=<span class="string">&quot;viewport&quot;</span> <span class="attr">content</span>=<span class="string">&quot;width=device-width,initial-scale=1&quot;</span>&gt;</span></span><br><span class="line">    <span class="tag">&lt;<span class="name">link</span> <span class="attr">href</span>=<span class="string">&quot;style.css&quot;</span> <span class="attr">rel</span>=<span class="string">&quot;stylesheet&quot;</span>&gt;</span></span><br><span class="line">  <span class="tag">&lt;/<span class="name">head</span>&gt;</span></span><br><span class="line">  <span class="tag">&lt;<span class="name">body</span>&gt;</span></span><br><span class="line">    <span class="tag">&lt;<span class="name">p</span>&gt;</span>Hello <span class="tag">&lt;<span class="name">span</span>&gt;</span>web performance<span class="tag">&lt;/<span class="name">span</span>&gt;</span> students!<span class="tag">&lt;/<span class="name">p</span>&gt;</span></span><br><span class="line">    <span class="tag">&lt;<span class="name">div</span>&gt;</span><span class="tag">&lt;<span class="name">img</span> <span class="attr">src</span>=<span class="string">&quot;awesome-photo.jpg&quot;</span>&gt;</span><span class="tag">&lt;/<span class="name">div</span>&gt;</span></span><br><span 
class="line">    <span class="tag">&lt;<span class="name">script</span> <span class="attr">src</span>=<span class="string">&quot;app.js&quot;</span>&gt;</span><span class="tag">&lt;/<span class="name">script</span>&gt;</span></span><br><span class="line">  <span class="tag">&lt;/<span class="name">body</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;/<span class="name">html</span>&gt;</span></span><br></pre></td></tr></table></figure>
<p><img src="http://wx1.sinaimg.cn/large/63503acbly1fk56e0i20xj20or06ygmi.jpg" alt="新增app.js后的关键渲染路径"></p>
<p><code>JavaScript</code>文件阻塞了<code>DOM</code>树的构建，并且在执行<code>JavaScript</code>脚本时还需要先等待构建<code>CSSOM</code>树，上图的关键渲染路径特性如下：</p>
<ul>
<li><p>关键资源： 3（<code>HTML</code>、<code>style.css</code>、<code>app.js</code>）</p>
</li>
<li><p>关键路径长度： 2或以上（浏览器会在一次连接中一起下载<code>style.css</code>和<code>app.js</code>）</p>
</li>
<li><p>关键字节：11KB</p>
</li>
</ul>
<p>现在，我们要优化关键渲染路径，首先为<code>&lt;script&gt;</code>标签添加异步属性<code>async</code>，这样这个<code>JavaScript</code>文件就不会阻塞浏览器的<code>HTML</code>解析器了。</p>
<figure class="highlight html"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br></pre></td><td class="code"><pre><span class="line"><span class="tag">&lt;<span class="name">html</span>&gt;</span></span><br><span class="line">  <span class="tag">&lt;<span class="name">head</span>&gt;</span></span><br><span class="line">    <span class="tag">&lt;<span class="name">meta</span> <span class="attr">name</span>=<span class="string">&quot;viewport&quot;</span> <span class="attr">content</span>=<span class="string">&quot;width=device-width,initial-scale=1&quot;</span>&gt;</span></span><br><span class="line">    <span class="tag">&lt;<span class="name">link</span> <span class="attr">href</span>=<span class="string">&quot;style.css&quot;</span> <span class="attr">rel</span>=<span class="string">&quot;stylesheet&quot;</span>&gt;</span></span><br><span class="line">  <span class="tag">&lt;/<span class="name">head</span>&gt;</span></span><br><span class="line">  <span class="tag">&lt;<span class="name">body</span>&gt;</span></span><br><span class="line">    <span class="tag">&lt;<span class="name">p</span>&gt;</span>Hello <span class="tag">&lt;<span class="name">span</span>&gt;</span>web performance<span class="tag">&lt;/<span class="name">span</span>&gt;</span> students!<span class="tag">&lt;/<span class="name">p</span>&gt;</span></span><br><span class="line">    <span class="tag">&lt;<span class="name">div</span>&gt;</span><span class="tag">&lt;<span class="name">img</span> <span class="attr">src</span>=<span class="string">&quot;awesome-photo.jpg&quot;</span>&gt;</span><span class="tag">&lt;/<span class="name">div</span>&gt;</span></span><br><span 
class="line">    <span class="tag">&lt;<span class="name">script</span> <span class="attr">src</span>=<span class="string">&quot;app.js&quot;</span> <span class="attr">async</span>&gt;</span><span class="tag">&lt;/<span class="name">script</span>&gt;</span></span><br><span class="line">  <span class="tag">&lt;/<span class="name">body</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;/<span class="name">html</span>&gt;</span></span><br></pre></td></tr></table></figure>
<p><img src="http://wx3.sinaimg.cn/large/63503acbly1fk56e0sokqj20oj074ab1.jpg" alt="为app.js添加async属性后的关键渲染路径"></p>
<ul>
<li><p>关键资源：2（<code>app.js</code>为异步加载，不会成为阻塞渲染的资源）</p>
</li>
<li><p>关键路径长度： 2或以上</p>
</li>
<li><p>关键字节： 9KB（<code>app.js</code>不再是关键资源，所以没有算上它的大小）</p>
</li>
</ul>
<p>接下来对<code>CSS</code>进行优化，比如添加上媒体查询。</p>
<figure class="highlight html"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br></pre></td><td class="code"><pre><span class="line"><span class="tag">&lt;<span class="name">html</span>&gt;</span></span><br><span class="line">  <span class="tag">&lt;<span class="name">head</span>&gt;</span></span><br><span class="line">    <span class="tag">&lt;<span class="name">meta</span> <span class="attr">name</span>=<span class="string">&quot;viewport&quot;</span> <span class="attr">content</span>=<span class="string">&quot;width=device-width,initial-scale=1&quot;</span>&gt;</span></span><br><span class="line">    <span class="tag">&lt;<span class="name">link</span> <span class="attr">href</span>=<span class="string">&quot;style.css&quot;</span> <span class="attr">rel</span>=<span class="string">&quot;stylesheet&quot;</span> <span class="attr">media</span>=<span class="string">&quot;print&quot;</span>&gt;</span></span><br><span class="line">  <span class="tag">&lt;/<span class="name">head</span>&gt;</span></span><br><span class="line">  <span class="tag">&lt;<span class="name">body</span>&gt;</span></span><br><span class="line">    <span class="tag">&lt;<span class="name">p</span>&gt;</span>Hello <span class="tag">&lt;<span class="name">span</span>&gt;</span>web performance<span class="tag">&lt;/<span class="name">span</span>&gt;</span> students!<span class="tag">&lt;/<span class="name">p</span>&gt;</span></span><br><span class="line">    <span class="tag">&lt;<span class="name">div</span>&gt;</span><span class="tag">&lt;<span class="name">img</span> <span class="attr">src</span>=<span class="string">&quot;awesome-photo.jpg&quot;</span>&gt;</span><span 
class="tag">&lt;/<span class="name">div</span>&gt;</span></span><br><span class="line">    <span class="tag">&lt;<span class="name">script</span> <span class="attr">src</span>=<span class="string">&quot;app.js&quot;</span> <span class="attr">async</span>&gt;</span><span class="tag">&lt;/<span class="name">script</span>&gt;</span></span><br><span class="line">  <span class="tag">&lt;/<span class="name">body</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;/<span class="name">html</span>&gt;</span></span><br></pre></td></tr></table></figure>
<p><img src="http://wx1.sinaimg.cn/large/63503acbly1fk56e15x4jj20ny082mxu.jpg" alt="为style.css添加媒体查询后的关键渲染路径"></p>
<ul>
<li><p>关键资源：1（<code>app.js</code>为异步加载，<code>style.css</code>只有在打印时才会使用，所以只剩下<code>HTML</code>一个关键资源，也就是说当<code>DOM</code>树构建完毕，浏览器就会开始进行渲染）</p>
</li>
<li><p>关键路径长度：1或以上</p>
</li>
<li><p>关键字节：5KB</p>
</li>
</ul>
<p><strong>优化关键渲染路径就是在对关键资源、关键路径长度和关键字节进行优化</strong>。关键资源越少，浏览器在渲染前的准备工作就越少；同样，关键路径长度和关键字节关系到浏览器下载资源的效率，它们越少，浏览器下载资源的速度就越快。</p>
<h3 id="其他优化方案"><a href="#其他优化方案" class="headerlink" title="其他优化方案"></a>其他优化方案</h3><hr>
<p>除了异步加载<code>JavaScript</code>和使用媒体查询外还有很多其他的优化方案可以使页面的首次加载变得更快，这些方案可以综合起来使用，但核心的思想还是针对关键渲染路径进行了优化。</p>
<h4 id="加载部分HTML"><a href="#加载部分HTML" class="headerlink" title="加载部分HTML"></a>加载部分HTML</h4><hr>
<p><strong>服务端在接收到请求时先只响应回<code>HTML</code>的初始部分，后续的<code>HTML</code>内容在需要时再通过<code>AJAX</code>获得</strong>。由于服务端只发送了部分<code>HTML</code>文件，这让构建<code>DOM</code>树的工作量减少很多，从而让用户感觉页面的加载速度很快。</p>
<p>注意，这个方法不能用在<code>CSS</code>上，浏览器不允许<code>CSSOM</code>只构建初始部分，否则会无法确定具体的样式。 </p>
<h4 id="压缩"><a href="#压缩" class="headerlink" title="压缩"></a>压缩</h4><hr>
<p><strong>通过对外部资源进行压缩可以大幅度地减少浏览器需要下载的资源量，它会减少关键路径长度与关键字节，使页面的加载速度变得更快。</strong></p>
<p><strong>对数据进行压缩其实就是使用更少的位数来对数据进行重编码</strong>。如今有非常多的压缩算法，且每一个的作用领域也各不相同，它们的复杂度也不相同，不过在这里我不会讲压缩算法的细节，感兴趣的朋友可以自己Google。</p>
<p>在对<code>HTML</code>、<code>CSS</code>和<code>JavaScript</code>这些文件进行压缩之前，还需要先进行一次冗余压缩。<strong>所谓冗余压缩，就是去除多余的字符，例如注释、空格符和换行符</strong>。这些字符对于程序员是有用的，毕竟没有格式化的代码可读性是非常恐怖的，但它们对于浏览器是没有任何意义的，去除这些冗余可以减少文件的数据量。<strong>在进行完冗余压缩之后，再使用压缩算法进一步对数据本身进行压缩</strong>，例如<code>GZIP</code>（<code>GZIP</code>是一个可以作用于任何字节流的通用压缩算法，它会记忆之前已经看到的内容，然后再尝试查找并替换重复的内容）。</p>
<h4 id="HTTP缓存"><a href="#HTTP缓存" class="headerlink" title="HTTP缓存"></a>HTTP缓存</h4><hr>
<p>通过网络来获取资源通常是缓慢的，如果资源文件过于庞大，浏览器还需要与服务器之间进行多次往返通信才能获得完整的资源文件。缓存可以复用之前获取的资源，既然后端可以使用缓存来减少访问数据库的开销，那前端自然也可以使用缓存来复用资源文件。</p>
<p>浏览器自带了<code>HTTP</code>缓存的功能，只需要确保每个服务器响应的头部都包含了以下的属性：</p>
<p><img src="http://wx3.sinaimg.cn/large/63503acbly1fk62pilgajj20ab07gt8x.jpg" alt="服务器响应头部中与缓存相关的属性"></p>
<ul>
<li><p>ETag： <strong>ETag是一个传递验证令牌，它对资源的更新进行检查，如果资源未发生变化时不会传送任何数据</strong>。当浏览器发送一个请求时，会把ETag一起发送到服务器，服务器会根据当前资源核对令牌（ETag通常是对内容进行<code>Hash</code>后得出的一个指纹），如果资源未发生变化，服务器将返回<code>304 Not Modified</code>响应，这时浏览器不必再次下载资源，而是继续复用缓存。</p>
</li>
<li><p>Cache-Control： <strong>Cache-Control定义了缓存的策略，它规定在什么条件下可以缓存响应以及可以缓存多久</strong>。</p>
<ul>
<li><p>no-cache： no-cache表示必须先与服务器确认返回的响应是否发生了变化，然后才能使用该响应来满足后续对同一网址的请求（每次都会根据ETag对服务器发送请求来确认变化，如果未发生变化，浏览器不会下载资源）。</p>
</li>
<li><p>no-store： no-store直接禁止浏览器以及所有中间缓存存储任何版本的返回响应。简单的说，该策略会禁止任何缓存，每次发送请求时，都会完整地下载服务器的响应。</p>
</li>
<li><p>public&amp;private： 如果响应被标记为public，则即使它有关联的<code>HTTP</code>身份验证，甚至响应状态代码通常无法缓存，浏览器也可以缓存响应。如果响应被标记为private，那么这个响应通常只为单个用户缓存，因此不允许任何中间缓存（CDN）对其进行缓存，private一般用在缓存用户私人信息页面。</p>
</li>
<li><p>max-age： max-age定义了从请求时间开始，缓存的最长时间，单位为秒。</p>
</li>
</ul>
</li>
</ul>
<h4 id="资源预加载"><a href="#资源预加载" class="headerlink" title="资源预加载"></a>资源预加载</h4><hr>
<p><strong><code>Pre-fetching</code>是一种提示浏览器预先加载用户之后可能会使用到的资源的方法。</strong></p>
<p>使用<code>dns-prefetch</code>来提前进行<code>DNS</code>解析，以便之后可以快速地访问另一个主机名（浏览器会在加载网页时对网页中的域名进行解析缓存，这样你在之后的访问时无需进行额外的DNS解析，减少了用户等待时间，提高了页面加载速度）。</p>
<figure class="highlight html"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line"><span class="tag">&lt;<span class="name">link</span> <span class="attr">rel</span>=<span class="string">&quot;dns-prefetch&quot;</span> <span class="attr">href</span>=<span class="string">&quot;other.hostname.com&quot;</span>&gt;</span></span><br></pre></td></tr></table></figure>
<p>使用<code>prefetch</code>属性可以预先下载资源，不过它的优先级是最低的。</p>
<figure class="highlight html"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line"><span class="tag">&lt;<span class="name">link</span> <span class="attr">rel</span>=<span class="string">&quot;prefetch&quot;</span>  <span class="attr">href</span>=<span class="string">&quot;/some_other_resource.jpeg&quot;</span>&gt;</span></span><br></pre></td></tr></table></figure>
<p><code>Chrome</code>允许使用<code>subresource</code>属性指定优先级最高的下载资源（当所有属性为<code>subresource</code>的资源下载完毕后，才会开始下载属性为<code>prefetch</code>的资源）。</p>
<figure class="highlight html"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line"><span class="tag">&lt;<span class="name">link</span> <span class="attr">rel</span>=<span class="string">&quot;subresource&quot;</span>  <span class="attr">href</span>=<span class="string">&quot;/some_other_resource.js&quot;</span>&gt;</span></span><br></pre></td></tr></table></figure>
<p><code>prerender</code>可以预先渲染好页面并隐藏起来，之后打开这个页面会跳过渲染阶段直接呈现在用户面前（推荐对用户接下来必须访问的页面进行预渲染，否则得不偿失）。</p>
<figure class="highlight html"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line"><span class="tag">&lt;<span class="name">link</span> <span class="attr">rel</span>=<span class="string">&quot;prerender&quot;</span>  <span class="attr">href</span>=<span class="string">&quot;//domain.com/next_page.html&quot;</span>&gt;</span></span><br></pre></td></tr></table></figure>
<h3 id="参考文献"><a href="#参考文献" class="headerlink" title="参考文献"></a>参考文献</h3><hr>
<ul>
<li><p><a target="_blank" rel="noopener" href="https://developers.google.com/web/fundamentals/">Web Fundamentals   |  Google Developers</a></p>
</li>
<li><p><a target="_blank" rel="noopener" href="http://www.stevesouders.com/blog/2009/05/18/flushing-the-document-early/">Flushing the Document Early | High Performance Web Sites</a></p>
</li>
<li><p><a target="_blank" rel="noopener" href="https://developer.mozilla.org/en-US/docs/Web/API/Document_Object_Model/Introduction">Introduction to the DOM - Web APIs | MDN</a></p>
</li>
<li><p><a target="_blank" rel="noopener" href="https://andydavies.me/blog/2013/10/22/how-the-browser-pre-loader-makes-pages-load-faster/">How the Browser Pre-loader Makes Pages Load Faster - Andy Davies</a></p>
</li>
</ul>

      
    </div>

    
    
    

    <footer class="post-footer">
        <div class="post-eof"></div>
      
    </footer>
  </article>
</div>




    


<div class="post-block">
  
  

  <article itemscope itemtype="http://schema.org/Article" class="post-content" lang="zh">
    <link itemprop="mainEntityOfPage" href="https://suyuhuan.gitee.io/yuwanzi.io/2017/09/20/2017-09-20-PictureSpider/">

    <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
      <meta itemprop="image" content="/yuwanzi.io/images/avatar.gif">
      <meta itemprop="name" content="玉丸子">
      <meta itemprop="description" content="这里是玉丸子的个人博客,与你一起发现更大的世界。">
    </span>

    <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
      <meta itemprop="name" content="玉丸子 | Blog">
    </span>
      <header class="post-header">
        <h2 class="post-title" itemprop="name headline">
          <a href="/yuwanzi.io/2017/09/20/2017-09-20-PictureSpider/" class="post-title-link" itemprop="url">教你如何快速实现一个图片爬虫</a>
        </h2>

        <div class="post-meta-container">
          <div class="post-meta">
    <span class="post-meta-item">
      <span class="post-meta-item-icon">
        <i class="far fa-calendar"></i>
      </span>
      <span class="post-meta-item-text">Veröffentlicht am</span>

      <time title="Erstellt: 2017-09-20 18:00:00" itemprop="dateCreated datePublished" datetime="2017-09-20T18:00:00+08:00">2017-09-20</time>
    </span>
      <span class="post-meta-item">
        <span class="post-meta-item-icon">
          <i class="far fa-calendar-check"></i>
        </span>
        <span class="post-meta-item-text">Bearbeitet am</span>
        <time title="Geändert am: 2020-11-07 08:58:17" itemprop="dateModified" datetime="2020-11-07T08:58:17+08:00">2020-11-07</time>
      </span>
    <span class="post-meta-item">
      <span class="post-meta-item-icon">
        <i class="far fa-folder"></i>
      </span>
      <span class="post-meta-item-text">in</span>
        <span itemprop="about" itemscope itemtype="http://schema.org/Thing">
          <a href="/yuwanzi.io/categories/%E5%90%8E%E7%AB%AF/" itemprop="url" rel="index"><span itemprop="name">后端</span></a>
        </span>
          . 
        <span itemprop="about" itemscope itemtype="http://schema.org/Thing">
          <a href="/yuwanzi.io/categories/%E5%90%8E%E7%AB%AF/python/" itemprop="url" rel="index"><span itemprop="name">python</span></a>
        </span>
    </span>

  
</div>

        </div>
      </header>

    
    
    
    <div class="post-body" itemprop="articleBody">
          <h3 id="什么是爬虫"><a href="#什么是爬虫" class="headerlink" title="什么是爬虫?"></a>什么是爬虫?</h3><hr>
<p>如果是没有接触过爬虫的人可能会有些许疑惑，爬虫是个什么东西呢？其实爬虫的概念很简单，在互联网时代,万维网已然是大量信息的载体，如何有效地利用并提取这些信息是一个巨大的挑战。<strong>当我们使用浏览器对某个网站发送请求时，服务器会响应<code>HTML</code>文本并由浏览器来进行渲染显示。爬虫正是利用了这一点，通过程序模拟用户的请求，来获得<code>HTML</code>的内容，并从中提取需要的数据和信息</strong>。如果把网络想象成一张蜘蛛网，爬虫程序则像是蜘蛛网上的蜘蛛，不断地爬取数据与信息。</p>
<p>爬虫的概念非常简单易懂，利用<code>python</code>内置的<code>urllib</code>库都可以实现一个简单的爬虫，下面的代码是一个非常简单的爬虫，只要有基本的<code>python</code>知识应该都能看懂。它会收集一个页面中的所有<code>&lt;a&gt;</code>标签(没有做任何规则判断)中的链接，然后顺着这些链接不断地进行深度搜索。</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br><span class="line">43</span><br><span class="line">44</span><br><span class="line">45</span><br><span class="line">46</span><br><span class="line">47</span><br><span class="line">48</span><br><span class="line">49</span><br><span class="line">50</span><br><span class="line">51</span><br><span class="line">52</span><br><span class="line">53</span><br><span class="line">54</span><br><span class="line">55</span><br><span class="line">56</span><br><span class="line">57</span><br><span class="line">58</span><br><span class="line">59</span><br><span class="line">60</span><br><span 
class="line">61</span><br><span class="line">62</span><br><span class="line">63</span><br><span class="line">64</span><br><span class="line">65</span><br><span class="line">66</span><br><span class="line">67</span><br><span class="line">68</span><br><span class="line">69</span><br><span class="line">70</span><br><span class="line">71</span><br><span class="line">72</span><br><span class="line">73</span><br><span class="line">74</span><br><span class="line">75</span><br><span class="line">76</span><br><span class="line">77</span><br><span class="line">78</span><br><span class="line">79</span><br><span class="line">80</span><br><span class="line">81</span><br><span class="line">82</span><br><span class="line">83</span><br><span class="line">84</span><br><span class="line">85</span><br><span class="line">86</span><br><span class="line">87</span><br><span class="line">88</span><br><span class="line">89</span><br><span class="line">90</span><br><span class="line">91</span><br><span class="line">92</span><br><span class="line">93</span><br><span class="line">94</span><br><span class="line">95</span><br><span class="line">96</span><br><span class="line">97</span><br><span class="line">98</span><br><span class="line">99</span><br><span class="line">100</span><br><span class="line">101</span><br><span class="line">102</span><br><span class="line">103</span><br><span class="line">104</span><br><span class="line">105</span><br><span class="line">106</span><br><span class="line">107</span><br><span class="line">108</span><br><span class="line">109</span><br><span class="line">110</span><br><span class="line">111</span><br><span class="line">112</span><br><span class="line">113</span><br><span class="line">114</span><br><span class="line">115</span><br><span class="line">116</span><br><span class="line">117</span><br><span class="line">118</span><br><span class="line">119</span><br><span class="line">120</span><br><span class="line">121</span><br><span 
class="line">122</span><br><span class="line">123</span><br><span class="line">124</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">from</span> bs4 <span class="keyword">import</span> BeautifulSoup</span><br><span class="line"><span class="keyword">import</span> urllib</span><br><span class="line"><span class="keyword">import</span> os</span><br><span class="line"><span class="keyword">from</span> datetime <span class="keyword">import</span> datetime</span><br><span class="line"></span><br><span class="line"><span class="comment"># 网页的实体类,只含有两个属性,url和标题</span></span><br><span class="line"><span class="class"><span class="keyword">class</span> <span class="title">Page</span>(<span class="params"><span class="built_in">object</span></span>):</span></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">__init__</span>(<span class="params">self,url,title</span>):</span></span><br><span class="line">        self._url = url</span><br><span class="line">        self._title = title</span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">__str__</span>(<span class="params">self</span>):</span></span><br><span class="line">        <span class="keyword">return</span> <span class="string">&#x27;[Url]: %s [Title]: %s&#x27;</span> %(self._url,self._title)</span><br><span class="line"></span><br><span class="line">    __repr__ = __str__</span><br><span class="line"></span><br><span class="line"><span class="meta">    @property</span></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">url</span>(<span class="params">self</span>):</span></span><br><span class="line">        <span class="keyword">return</span> self._url</span><br><span class="line"></span><br><span class="line"><span class="meta">    @property</span></span><br><span class="line">    <span 
class="function"><span class="keyword">def</span> <span class="title">title</span>(<span class="params">self</span>):</span></span><br><span class="line">        <span class="keyword">return</span> self._title</span><br><span class="line"></span><br><span class="line"><span class="meta">    @url.setter</span></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">url</span>(<span class="params">self,value</span>):</span></span><br><span class="line">        <span class="keyword">if</span> <span class="keyword">not</span> <span class="built_in">isinstance</span>(value,<span class="built_in">str</span>):</span><br><span class="line">            <span class="keyword">raise</span> ValueError(<span class="string">&#x27;url must be a string!&#x27;</span>)</span><br><span class="line">        <span class="keyword">if</span> value == <span class="string">&#x27;&#x27;</span>:</span><br><span class="line">            <span class="keyword">raise</span> ValueError(<span class="string">&#x27;url must be not empty!&#x27;</span>)</span><br><span class="line">        self._url = value</span><br><span class="line"></span><br><span class="line"><span class="meta">    @title.setter</span></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">title</span>(<span class="params">self,value</span>):</span></span><br><span class="line">        <span class="keyword">if</span> <span class="keyword">not</span> <span class="built_in">isinstance</span>(value,<span class="built_in">str</span>):</span><br><span class="line">            <span class="keyword">raise</span> ValueError(<span class="string">&#x27;title must be a string!&#x27;</span>)</span><br><span class="line">        <span class="keyword">if</span> value == <span class="string">&#x27;&#x27;</span>:</span><br><span class="line">            <span class="keyword">raise</span> ValueError(<span class="string">&#x27;title must be 
not empty!&#x27;</span>)</span><br><span class="line">        self._title = value</span><br><span class="line"></span><br><span class="line"><span class="class"><span class="keyword">class</span> <span class="title">Spider</span>(<span class="params"><span class="built_in">object</span></span>):</span></span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">__init__</span>(<span class="params">self,init_page</span>):</span></span><br><span class="line">        self._init_page = init_page <span class="comment"># 种子网页,也就是爬虫的入口</span></span><br><span class="line">        self._pages = []</span><br><span class="line">        self._soup = <span class="literal">None</span> <span class="comment"># BeautifulSoup 一个用来解析HTML的解析器</span></span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">crawl</span>(<span class="params">self</span>):</span></span><br><span class="line">        start_time = datetime.now()</span><br><span class="line">        print(<span class="string">&#x27;[Start Time]: %s&#x27;</span> % start_time)</span><br><span class="line">        start_timestamp = start_time.timestamp()</span><br><span class="line">        tocrawl = [self._init_page] <span class="comment"># 记录将要爬取的网页</span></span><br><span class="line">        crawled = [] <span class="comment"># 记录已经爬取过的网页</span></span><br><span class="line">		<span class="comment"># 不断循环,直到将这张图搜索完毕</span></span><br><span class="line">        <span class="keyword">while</span> tocrawl:</span><br><span class="line">            page = tocrawl.pop()</span><br><span class="line">            <span class="keyword">if</span> page <span class="keyword">not</span> <span class="keyword">in</span> crawled:</span><br><span class="line">                self._init_soup(page)</span><br><span class="line">                self._packaging_to_pages(page)</span><br><span 
class="line">                links = self._extract_links()</span><br><span class="line">                self._union_list(tocrawl,links)</span><br><span class="line">                crawled.append(page)</span><br><span class="line">        self._write_to_curdir()</span><br><span class="line">        end_time = datetime.now()</span><br><span class="line">        print(<span class="string">&#x27;[End Time]: %s&#x27;</span> % end_time)</span><br><span class="line">        end_timestamp = end_time.timestamp()</span><br><span class="line">        print(<span class="string">&#x27;[Total Time Consuming]: %f.3s&#x27;</span> % (start_timestamp - end_timestamp) / <span class="number">1000</span>)</span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">_init_soup</span>(<span class="params">self,page</span>):</span></span><br><span class="line">        page_content = <span class="literal">None</span></span><br><span class="line">        <span class="keyword">try</span>:</span><br><span class="line">			<span class="comment"># urllib可以模拟用户请求,获得响应的HTML文本内容</span></span><br><span class="line">            page_content = urllib.request.urlopen(page).read()</span><br><span class="line">        <span class="keyword">except</span>:</span><br><span class="line">            page_content = <span class="string">&#x27;&#x27;</span></span><br><span class="line">		<span class="comment"># 初始化BeautifulSoup,参数二是使用到的解析器名字	</span></span><br><span class="line">        self._soup = BeautifulSoup(page_content,<span class="string">&#x27;lxml&#x27;</span>)</span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">_extract_links</span>(<span class="params">self</span>):</span></span><br><span class="line">        a_tags = self._soup.find_all(<span class="string">&#x27;a&#x27;</span>) <span class="comment"># 找到所有a标签</span></span><br><span 
class="line">        links = []</span><br><span class="line">		<span class="comment"># 收集所有a标签中的链接</span></span><br><span class="line">        <span class="keyword">for</span> a_tag <span class="keyword">in</span> a_tags:</span><br><span class="line">            links.append(a_tag.get(<span class="string">&#x27;href&#x27;</span>))</span><br><span class="line">        <span class="keyword">return</span> links</span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">_packaging_to_pages</span>(<span class="params">self,page</span>):</span></span><br><span class="line">        title_string = <span class="string">&#x27;&#x27;</span></span><br><span class="line">        <span class="keyword">try</span>:</span><br><span class="line">            title_string = self._soup.title.string <span class="comment"># 获得title标签中的文本内容</span></span><br><span class="line">        <span class="keyword">except</span> AttributeError <span class="keyword">as</span> e :</span><br><span class="line">            print(e)</span><br><span class="line">        page_obj = Page(page,title_string)</span><br><span class="line">        print(page_obj)</span><br><span class="line">        self._pages.append(page_obj)</span><br><span class="line"></span><br><span class="line">	<span class="comment"># 将爬取到的所有信息写入到当前目录下的out.txt文件</span></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">_write_to_curdir</span>(<span class="params">self</span>):</span></span><br><span class="line">        cur_path = os.path.join(os.path.abspath(<span class="string">&#x27;.&#x27;</span>),<span class="string">&#x27;out.txt&#x27;</span>)</span><br><span class="line">        print(<span class="string">&#x27;Start write to %s&#x27;</span> % cur_path)</span><br><span class="line">        <span class="keyword">with</span> <span class="built_in">open</span>(cur_path,<span 
class="string">&#x27;w&#x27;</span>) <span class="keyword">as</span> f:</span><br><span class="line">            f.write(self._pages)</span><br><span class="line"></span><br><span class="line">    <span class="comment"># 将dest中的不存在于src的元素合并到src</span></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">_union_list</span>(<span class="params">self,src,dest</span>):</span></span><br><span class="line">        <span class="keyword">for</span> dest_val <span class="keyword">in</span> dest:</span><br><span class="line">            <span class="keyword">if</span> dest_val <span class="keyword">not</span> <span class="keyword">in</span> src:</span><br><span class="line">                src.append(dest_val)</span><br><span class="line"></span><br><span class="line"><span class="meta">    @property</span></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">init_page</span>(<span class="params">self</span>):</span></span><br><span class="line">        <span class="keyword">return</span> self._init_page</span><br><span class="line"></span><br><span class="line"><span class="meta">    @property</span></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">pages</span>(<span class="params">self</span>):</span></span><br><span class="line">        <span class="keyword">return</span> self._pages</span><br><span class="line"></span><br><span class="line"></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">test</span>():</span></span><br><span class="line">    spider = Spider(<span class="string">&#x27;https://sylvanassun.github.io/&#x27;</span>)</span><br><span class="line">    spider.crawl()</span><br><span class="line"></span><br><span class="line"><span class="keyword">if</span> __name__ == <span class="string">&#x27;__main__&#x27;</span>:</span><br><span 
class="line">    test()</span><br></pre></td></tr></table></figure>

<p>但是我们如果想要实现一个性能高效的爬虫，那需要的复杂度也会增长，本文旨在快速实现，所以我们需要借助他人实现的爬虫框架来当做脚手架，在这之上来构建我们的图片爬虫(如果有时间的话当然也鼓励自己造轮子啦)。</p>
<blockquote>
<p>本文作者为: <a target="_blank" rel="noopener" href="https://github.com/SylvanasSun">SylvanasSun(sylvanas.sun@gmail.com)</a>.转载请务必将下面这段话置于文章开头处(保留超链接).<br>本文首发自<a target="_blank" rel="noopener" href="https://sylvanassun.github.io/">SylvanasSun Blog</a>,原文链接: <a target="_blank" rel="noopener" href="https://sylvanassun.github.io/2017/09/20/2017-09-20-PictureSpider/">https://sylvanassun.github.io/2017/09/20/2017-09-20-PictureSpider/</a></p>
</blockquote>
<h3 id="BeautifulSoup"><a href="#BeautifulSoup" class="headerlink" title="BeautifulSoup"></a>BeautifulSoup</h3><hr>
<p><strong>BeautifulSoup是一个用于从<code>HTML</code>和<code>XML</code>中提取数据的<code>python</code>库</strong>。Beautiful Soup自动将输入文档转换为Unicode编码，输出文档转换为utf-8编码。你不需要考虑编码方式，除非文档没有指定一个编码方式，这时，Beautiful Soup就不能自动识别编码方式了。然后，你仅仅需要说明一下原始编码方式就可以了。</p>
<p>利用好BeautifulSoup可以为我们省去许多编写正则表达式的时间；当你需要更精准地进行搜索时，BeautifulSoup也支持使用正则表达式进行查询。</p>
<p>BeautifulSoup3已经停止维护了，现在基本使用的都是BeautifulSoup4，安装BeautifulSoup4很简单，只需要执行以下的命令。</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line">pip install beautifulsoup4</span><br></pre></td></tr></table></figure>
<p>然后从<code>bs4</code>模块中导入BeautifulSoup对象，并创建这个对象。</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">from</span> bs4 <span class="keyword">import</span> BeautifulSoup</span><br><span class="line"></span><br><span class="line">soup = BeautifulSoup(body,<span class="string">&#x27;lxml&#x27;</span>)</span><br></pre></td></tr></table></figure>
<p>创建BeautifulSoup对象需要传入两个参数，第一个是需要进行解析的<code>HTML</code>内容，第二个参数为解析器的名字(如果不传入这个参数，BeautifulSoup会自动选择当前环境中已安装的最佳解析器，优先级依次为<code>lxml</code>、<code>html5lib</code>、<code>python</code>内置的<code>html.parser</code>，因此同一段代码在不同环境下的解析结果可能不同)。BeautifulSoup支持多种解析器，有<code>lxml</code>、<code>html5lib</code>、<code>html.parser</code>。</p>
<p>第三方解析器需要用户自己安装，本文中使用的是<code>lxml</code>解析器，安装命令如下（在没有预编译安装包的平台上，它还需要先安装C语言库<code>libxml2</code>和<code>libxslt</code>）。</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line">pip install lxml</span><br></pre></td></tr></table></figure>
<p>下面以一个例子演示使用BeautifulSoup的基本方式，如果还想了解更多可以去参考<a target="_blank" rel="noopener" href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/">BeautifulSoup文档</a>。</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br><span class="line">43</span><br><span class="line">44</span><br><span class="line">45</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">from</span> bs4 <span class="keyword">import</span> BeautifulSoup</span><br><span class="line"></span><br><span class="line">html = <span class="string">&quot;&quot;&quot;</span></span><br><span class="line"><span class="string">&lt;html&gt;&lt;head&gt;&lt;title&gt;The Dormouse&#x27;s story&lt;/title&gt;&lt;/head&gt;</span></span><br><span class="line"><span class="string">&lt;body&gt;</span></span><br><span 
class="line"><span class="string">&lt;p class=&quot;title&quot; name=&quot;dromouse&quot;&gt;&lt;b&gt;The Dormouse&#x27;s story&lt;/b&gt;&lt;/p&gt;</span></span><br><span class="line"><span class="string">&lt;p class=&quot;story&quot;&gt;Once upon a time there were three little sisters; and their names were</span></span><br><span class="line"><span class="string">&lt;a href=&quot;http://example.com/elsie&quot; class=&quot;sister&quot; id=&quot;link1&quot;&gt;&lt;!-- Elsie --&gt;&lt;/a&gt;,</span></span><br><span class="line"><span class="string">&lt;a href=&quot;http://example.com/lacie&quot; class=&quot;sister&quot; id=&quot;link2&quot;&gt;Lacie&lt;/a&gt; and</span></span><br><span class="line"><span class="string">&lt;a href=&quot;http://example.com/tillie&quot; class=&quot;sister&quot; id=&quot;link3&quot;&gt;Tillie&lt;/a&gt;;</span></span><br><span class="line"><span class="string">and they lived at the bottom of a well.&lt;/p&gt;</span></span><br><span class="line"><span class="string">&lt;p class=&quot;story&quot;&gt;...&lt;/p&gt;</span></span><br><span class="line"><span class="string">&quot;&quot;&quot;</span></span><br><span class="line"></span><br><span class="line">soup = BeautifulSoup(html,<span class="string">&#x27;lxml&#x27;</span>)</span><br><span class="line"><span class="comment"># 格式化输出soup中的内容</span></span><br><span class="line">print(soup.prettify())</span><br><span class="line"></span><br><span class="line"><span class="comment"># 可以通过.操作符来访问标签对象</span></span><br><span class="line">title = soup.title</span><br><span class="line">print(title)</span><br><span class="line">p = soup.p</span><br><span class="line">print(p)</span><br><span class="line"></span><br><span class="line"><span class="comment"># 获得title标签中的文本内容,这2个方法得到的结果是一样的</span></span><br><span class="line">print(title.text)</span><br><span class="line">print(title.get_text())</span><br><span class="line"></span><br><span class="line"></span><br><span class="line"><span 
class="comment"># 获得head标签的所有子节点,contents返回的是一个列表,children返回的是一个迭代器</span></span><br><span class="line">head = soup.head</span><br><span class="line">print(head.contents)</span><br><span class="line">print(head.children)</span><br><span class="line"></span><br><span class="line"><span class="comment"># 获得所有a标签,并输出每个a标签href属性中的内容</span></span><br><span class="line">a_tags = soup.find_all(<span class="string">&#x27;a&#x27;</span>)</span><br><span class="line"><span class="keyword">for</span> a_tag <span class="keyword">in</span> a_tags:</span><br><span class="line">    print(a_tag[<span class="string">&#x27;href&#x27;</span>])</span><br><span class="line"><span class="comment"># find函数与find_all一样,只不过返回的是找到的第一个标签    </span></span><br><span class="line">print(soup.find(<span class="string">&#x27;a&#x27;</span>)[<span class="string">&#x27;href&#x27;</span>])</span><br><span class="line"></span><br><span class="line"><span class="comment"># 根据属性查找,这2个方法得到的结果是一样的</span></span><br><span class="line">print(soup.find(<span class="string">&#x27;p&#x27;</span>,class_=<span class="string">&#x27;title&#x27;</span>))</span><br><span class="line">print(soup.find(<span class="string">&#x27;p&#x27;</span>,attrs=&#123;<span class="string">&#x27;class&#x27;</span>: <span class="string">&#x27;title&#x27;</span>&#125;))</span><br><span class="line"></span><br></pre></td></tr></table></figure>

<h3 id="Scrapy"><a href="#Scrapy" class="headerlink" title="Scrapy"></a>Scrapy</h3><hr>
<p><code>Scrapy</code>是一个功能强大的爬虫框架，它已经实现了一个性能高效的爬虫结构，并提供了很多供程序员自定义的配置。使用<code>Scrapy</code>只需要在它的规则上编写我们的爬虫逻辑即可。</p>
<p>首先需要先安装<code>Scrapy</code>,执行命令<code>pip install scrapy</code>。然后再执行命令<code>scrapy startproject 你的项目名</code>来生成<code>Scrapy</code>的基本项目文件夹。生成的项目结构如下。</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br></pre></td><td class="code"><pre><span class="line">你的项目名&#x2F;</span><br><span class="line">    scrapy.cfg</span><br><span class="line">    你的项目名&#x2F;</span><br><span class="line">        __init__.py</span><br><span class="line">        items.py</span><br><span class="line">        pipelines.py</span><br><span class="line">        settings.py</span><br><span class="line">        spiders&#x2F;</span><br><span class="line">            __init__.py</span><br><span class="line">            ...</span><br></pre></td></tr></table></figure>
<ul>
<li><p><code>scrapy.cfg</code> : 项目的配置文件。</p>
</li>
<li><p><code>items.py</code>：物品模块，用户需要在这个模块中定义数据封装的实体类。</p>
</li>
<li><p><code>pipelines.py</code>：管道模块，用户需要在这个模块中定义处理数据的逻辑（如存储到数据库等）。</p>
</li>
<li><p><code>settings.py</code>：这个模块定义了整个项目中的各种配置变量。</p>
</li>
<li><p><code>spiders/</code>：在这个包中定义用户自己的爬虫模块。</p>
</li>
</ul>
<p>启动<code>Scrapy</code>的爬虫也很简单，只需要执行命令<code>scrapy crawl 你的爬虫名</code>。下面介绍<code>Scrapy</code>中的关键模块的演示案例，如果想要了解有关<code>Scrapy</code>的更多信息，请参考<a target="_blank" rel="noopener" href="https://doc.scrapy.org/en/0.24/intro/tutorial.html">Scrapy官方文档</a>。</p>
<h4 id="items"><a href="#items" class="headerlink" title="items"></a>items</h4><hr>
<p><strong><code>items</code>模块主要是为了将爬取到的非结构化数据封装到一个结构化对象中，自定义的<code>item</code>类必须继承自<code>scrapy.Item</code>，且每个属性都要赋值为<code>scrapy.Field()</code>。</strong></p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">import</span> scrapy</span><br><span class="line"></span><br><span class="line"><span class="class"><span class="keyword">class</span> <span class="title">Product</span>(<span class="params">scrapy.Item</span>):</span></span><br><span class="line">    name = scrapy.Field()</span><br><span class="line">    price = scrapy.Field()</span><br><span class="line">    stock = scrapy.Field()</span><br></pre></td></tr></table></figure>
<p>操作<code>item</code>对象就像操作一个<code>dict</code>对象一样简单。</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br></pre></td><td class="code"><pre><span class="line">product &#x3D; Product()</span><br><span class="line"># 对属性赋值</span><br><span class="line">product[&#39;name&#39;] &#x3D; &#39;Sylvanas&#39;</span><br><span class="line">product[&#39;price&#39;] &#x3D; 998</span><br><span class="line"># 获得属性</span><br><span class="line">print(product[&#39;name&#39;])</span><br><span class="line">print(product[&#39;price&#39;])</span><br></pre></td></tr></table></figure>
<h4 id="pipelines"><a href="#pipelines" class="headerlink" title="pipelines"></a>pipelines</h4><hr>
<p><strong>当一个<code>Item</code>经由爬虫封装之后将会到达<code>Pipeline</code>类，你可以定义自己的<code>Pipeline</code>类来决定<code>Item</code>的处理策略。</strong></p>
<p>每个<code>Pipeline</code>可以实现以下函数。</p>
<ul>
<li><p><code>process_item(item, spider)</code>： 每个<code>Pipeline</code>都会调用此函数来处理<code>Item</code>，这个函数必须返回一个<code>Item</code>，如果在处理过程中遇见错误，可以抛出<code>DropItem</code>异常。</p>
</li>
<li><p><code>open_spider(spider)</code>： 当<code>spider</code>开始时将会调用此函数，可以利用这个函数进行打开文件等操作。</p>
</li>
<li><p><code>close_spider(spider)</code>：当<code>spider</code>关闭时将会调用此函数，可以利用这个函数对<code>IO</code>资源进行关闭。</p>
</li>
<li><p><code>from_crawler(cls, crawler)</code>： 这个函数用于获取<code>settings.py</code>模块中的属性。注意这个函数是一个类方法。</p>
</li>
</ul>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">from</span> scrapy.exceptions <span class="keyword">import</span> DropItem</span><br><span class="line"></span><br><span class="line"><span class="class"><span class="keyword">class</span> <span class="title">PricePipeline</span>(<span class="params"><span class="built_in">object</span></span>):</span></span><br><span class="line"></span><br><span class="line">    vat_factor = <span class="number">1.15</span></span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">__init__</span>(<span class="params">self, HELLO</span>):</span></span><br><span class="line">		self.HELLO = HELLO</span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">process_item</span>(<span class="params">self, item, spider</span>):</span></span><br><span class="line">        <span class="keyword">if</span> item[<span class="string">&#x27;price&#x27;</span>]:</span><br><span class="line">            <span class="keyword">if</span> item[<span class="string">&#x27;price_excludes_vat&#x27;</span>]:</span><br><span class="line">                
item[<span class="string">&#x27;price&#x27;</span>] = item[<span class="string">&#x27;price&#x27;</span>] * self.vat_factor</span><br><span class="line">            <span class="keyword">return</span> item</span><br><span class="line">        <span class="keyword">else</span>:</span><br><span class="line">            <span class="keyword">raise</span> DropItem(<span class="string">&quot;Missing price in %s&quot;</span> % item)</span><br><span class="line">			</span><br><span class="line"><span class="meta">    @classmethod</span></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">from_crawler</span>(<span class="params">cls, crawler</span>):</span></span><br><span class="line">        settings = crawler.settings <span class="comment"># 从crawler中获得settings</span></span><br><span class="line">        <span class="keyword">return</span> cls(settings[<span class="string">&#x27;HELLO&#x27;</span>]) <span class="comment"># 返回settings中的属性，将由__init__函数接收			 </span></span><br></pre></td></tr></table></figure>
<p>当定义完你的<code>Pipeline</code>后，还需要在<code>settings.py</code>中对你的<code>Pipeline</code>进行设置。</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br></pre></td><td class="code"><pre><span class="line">ITEM_PIPELINES = &#123;</span><br><span class="line">	<span class="comment"># 后面跟的数字是优先级别</span></span><br><span class="line">    <span class="string">&#x27;pipeline类的全路径&#x27;</span>: <span class="number">300</span>,</span><br><span class="line">&#125;</span><br></pre></td></tr></table></figure>

<h4 id="spiders"><a href="#spiders" class="headerlink" title="spiders"></a>spiders</h4><hr>
<p>在<code>spiders</code>模块中，用户可以通过自定义<code>Spider</code>类来制定自己的爬虫逻辑与数据封装策略。<strong>每个<code>Spider</code>都必须继承自<code>class scrapy.spider.Spider</code></strong>，这是<code>Scrapy</code>中最简单的爬虫基类，它没有什么特殊功能，<code>Scrapy</code>也提供了其他功能不同的<code>Spider</code>类供用户选择，这里就不多叙述了，可以去参考官方文档。</p>
<p>用户可以通过以下属性来自定义配置<code>Spider</code>:</p>
<ul>
<li><p><code>name</code>： 这是<code>Spider</code>的名称，<code>Scrapy</code>需要通过这个属性来定位<code>Spider</code>并启动爬虫，它是唯一且必需的。</p>
</li>
<li><p><code>allowed_domains</code>： 这个属性规定了<code>Spider</code>允许爬取的域名。</p>
</li>
<li><p><code>start_urls</code>： <code>Spider</code>开始时将抓取的网页列表。</p>
</li>
<li><p><code>start_requests()</code>： 该函数是<code>Spider</code>开始抓取时启动的函数，它只会被调用一次，有的网站必须要求用户登录，可以使用这个函数先进行模拟登录。</p>
</li>
<li><p><code>make_requests_from_url(url)</code>： 该函数接收一个<code>url</code>并返回<code>Request</code>对象。除非重写该函数，否则它会默认以<code>parse(response)</code>函数作为回调函数，并启用<code>dont_filter</code>参数（启用该参数后，请求将不会被调度器的去重过滤器过滤，即允许重复请求同一个<code>url</code>）。</p>
</li>
<li><p><code>parse(response)</code>： 当请求没有设置回调函数时，则会默认调用<code>parse(response)</code>。</p>
</li>
<li><p><code>log(message[, level, component])</code>： 用于记录日志。</p>
</li>
<li><p><code>closed(reason)</code>： 当<code>Spider</code>关闭时调用。</p>
</li>
</ul>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">import</span> scrapy</span><br><span class="line"></span><br><span class="line"></span><br><span class="line"><span class="class"><span class="keyword">class</span> <span class="title">MySpider</span>(<span class="params">scrapy.Spider</span>):</span></span><br><span class="line">    name = <span class="string">&#x27;example.com&#x27;</span></span><br><span class="line">    allowed_domains = [<span class="string">&#x27;example.com&#x27;</span>]</span><br><span class="line">    start_urls = [</span><br><span class="line">        <span class="string">&#x27;http://www.example.com/1.html&#x27;</span>,</span><br><span class="line">        <span class="string">&#x27;http://www.example.com/2.html&#x27;</span>,</span><br><span class="line">        <span class="string">&#x27;http://www.example.com/3.html&#x27;</span>,</span><br><span class="line">    ]</span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">parse</span>(<span class="params">self, response</span>):</span></span><br><span class="line">        self.log(<span class="string">&#x27;A response from %s just arrived!&#x27;</span> % response.url)</span><br></pre></td></tr></table></figure>

<h3 id="其他依赖库"><a href="#其他依赖库" class="headerlink" title="其他依赖库"></a>其他依赖库</h3><hr>
<h4 id="Requests"><a href="#Requests" class="headerlink" title="Requests"></a>Requests</h4><hr>
<p><code>Requests</code>也是一个第三方<code>python</code>库，它比<code>python</code>内置的<code>urllib</code>更加简单好用。只需要安装（<code>pip install requests</code>）并导入该包，即可轻松对网站发起请求。</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">import</span> requests</span><br><span class="line"></span><br><span class="line"><span class="comment"># 支持http的各种类型请求</span></span><br><span class="line">r = requests.post(<span class="string">&quot;http://httpbin.org/post&quot;</span>)</span><br><span class="line">r = requests.put(<span class="string">&quot;http://httpbin.org/put&quot;</span>)</span><br><span class="line">r = requests.delete(<span class="string">&quot;http://httpbin.org/delete&quot;</span>)</span><br><span class="line">r = requests.head(<span class="string">&quot;http://httpbin.org/get&quot;</span>)</span><br><span class="line">r = requests.options(<span class="string">&quot;http://httpbin.org/get&quot;</span>)</span><br><span class="line"></span><br><span class="line"><span class="comment"># 获得响应内容</span></span><br><span class="line">r.text <span class="comment"># 返回文本</span></span><br><span class="line">r.content <span class="comment"># 返回字节</span></span><br><span class="line">r.raw <span class="comment"># 返回原始内容</span></span><br><span class="line">r.json() <span class="comment"># 返回json</span></span><br></pre></td></tr></table></figure>
<p>关于更多的参数与内容请参考<a target="_blank" rel="noopener" href="http://docs.python-requests.org/zh_CN/latest/user/quickstart.html">Requests文档</a>。</p>
<h4 id="BloomFilter"><a href="#BloomFilter" class="headerlink" title="BloomFilter"></a>BloomFilter</h4><hr>
<p><code>BloomFilter</code>是一个用于过滤重复数据的数据结构，我们可以使用它来对重复的<code>url</code>进行过滤。本文使用的<code>BloomFilter</code>来自于<a target="_blank" rel="noopener" href="https://github.com/jaybaird/python-bloomfilter">python-bloomfilter</a>，其他操作系统用户请使用<code>pip install pybloom</code>命令安装，windows用户请使用<code>pip install pybloom-live</code>（原版对windows不友好）。</p>
<h3 id="分析"><a href="#分析" class="headerlink" title="分析"></a>分析</h3><hr>
<p>介绍了需要的依赖库之后，我们终于可以开始实现自己的图片爬虫了。我们的目标是爬<code>https://www.deviantart.com/</code>网站中的图片，在写爬虫程序之前，还需要先分析一下页面的<code>HTML</code>结构，这样才能针对性地找到图片的源地址。</p>
<p>为了保证爬到的图片的质量，我决定从热门页面开始爬，链接为<code>https://www.deviantart.com/whats-hot/</code>。</p>
<p>打开浏览器的开发者工具后，可以发现每个图片都是由一个<code>a</code>标签组成，每个<code>a</code>标签的<code>class</code>为<code>torpedo-thumb-link</code>，而这个<code>a</code>标签的<code>href</code>正好就是这张图片的详情页面（如果我们从这里就开始爬图片的话，那么爬到的可都只是缩略图）。</p>
<p><img src="http://wx1.sinaimg.cn/large/63503acbly1fjrjc53yfgj20ze0hj4fa.jpg" alt="开发者工具截图：热门页面中 class 为 torpedo-thumb-link 的 a 标签"></p>
<p>进入到详情页后，不要马上爬取当前图片的源地址，因为当前页显示的图片并不是原始格式，我们对图片双击放大之后再使用开发者工具抓到这个图片所在的<code>img</code>标签后，再让爬虫获取这个标签中的源地址。</p>
<p><img src="http://wx3.sinaimg.cn/large/63503acbly1fjrjc67dulj21eg0h2qpr.jpg" alt="开发者工具截图：详情页中放大后图片所在的 img 标签"></p>
<p>在获得图片的源地址之后，我的策略是让爬虫继续爬取该页中推荐的更多图片，通过开发者工具，可以发现这些图片都被封装在一个<code>class</code>为<code>tt-crop thumb</code>的<code>div</code>标签中，而该标签里的第一个<code>a</code>子标签正好就是这个图片的详情页链接。</p>
<p><img src="http://wx1.sinaimg.cn/large/63503acbly1fjrjc5msszj20sn0fkn80.jpg" alt="开发者工具截图：详情页中 class 为 tt-crop thumb 的 div 标签及其 a 子标签"></p>
<h3 id="初始配置"><a href="#初始配置" class="headerlink" title="初始配置"></a>初始配置</h3><hr>
<p>在对网页的<code>HTML</code>进行分析之后，可以开始写程序了，首先先用<code>Scrapy</code>的命令来初始化项目。之后在<code>settings.py</code>中做如下配置。</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br></pre></td><td class="code"><pre><span class="line"><span class="comment"># 这个是网络爬虫协议，爬虫访问网站时都会检查是否有robots.txt文件，</span></span><br><span class="line"><span class="comment"># 然后根据文件中的内容选择性地进行爬取，我们这里设置为False即不检查robots.txt</span></span><br><span class="line">ROBOTSTXT_OBEY = <span class="literal">False</span></span><br><span class="line"></span><br><span class="line"><span class="comment"># 图片下载的根目录路径</span></span><br><span class="line">IMAGES_STORE = <span class="string">&#x27;.&#x27;</span></span><br><span class="line"></span><br><span class="line"><span class="comment"># 图片最大下载数量，当下载的图片达到这个数字时，将会手动关闭爬虫</span></span><br><span class="line">MAXIMUM_IMAGE_NUMBER = <span class="number">10000</span></span><br></pre></td></tr></table></figure>
<p>然后定义我们的<code>Item</code>。</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">import</span> scrapy</span><br><span class="line"></span><br><span class="line"></span><br><span class="line"><span class="class"><span class="keyword">class</span> <span class="title">DeviantArtSpiderItem</span>(<span class="params">scrapy.Item</span>):</span></span><br><span class="line">    author = scrapy.Field() <span class="comment"># 作者名</span></span><br><span class="line">    image_name = scrapy.Field() <span class="comment"># 图片名</span></span><br><span class="line">    image_id = scrapy.Field() <span class="comment"># 图片id</span></span><br><span class="line">    image_src = scrapy.Field() <span class="comment"># 图片的源地址</span></span><br></pre></td></tr></table></figure>
<p>创建自己的<code>spider</code>模块与<code>Spider</code>类。</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">import</span> requests</span><br><span class="line"><span class="keyword">from</span> bs4 <span class="keyword">import</span> BeautifulSoup</span><br><span class="line"><span class="comment"># this import package is right,if PyCharm give out warning please ignore</span></span><br><span class="line"><span class="keyword">from</span> deviant_art_spider.items <span class="keyword">import</span> DeviantArtSpiderItem</span><br><span class="line"><span class="keyword">from</span> pybloom_live <span class="keyword">import</span> BloomFilter</span><br><span class="line"><span class="keyword">from</span> scrapy.contrib.linkextractors.lxmlhtml <span class="keyword">import</span> LxmlLinkExtractor</span><br><span class="line"><span class="keyword">from</span> scrapy.contrib.spiders <span 
class="keyword">import</span> CrawlSpider, Rule</span><br><span class="line"><span class="keyword">from</span> scrapy.http <span class="keyword">import</span> Request</span><br><span class="line"></span><br><span class="line"><span class="class"><span class="keyword">class</span> <span class="title">DeviantArtImageSpider</span>(<span class="params">CrawlSpider</span>):</span></span><br><span class="line">    name = <span class="string">&#x27;deviant_art_image_spider&#x27;</span></span><br><span class="line"></span><br><span class="line">    <span class="comment"># 我不想让scrapy帮助过滤所以设置为空</span></span><br><span class="line">    allowed_domains = <span class="string">&#x27;&#x27;</span></span><br><span class="line"></span><br><span class="line">    start_urls = [<span class="string">&#x27;https://www.deviantart.com/whats-hot/&#x27;</span>]</span><br><span class="line"></span><br><span class="line">    rules = (</span><br><span class="line">        Rule(LxmlLinkExtractor(</span><br><span class="line">            allow=&#123;<span class="string">&#x27;https://www.deviantart.com/whats-hot/[\?\w+=\d+]*&#x27;</span>, &#125;),</span><br><span class="line">            callback=<span class="string">&#x27;parse_page&#x27;</span>, <span class="comment"># 设置回调函数</span></span><br><span class="line">            follow=<span class="literal">True</span> <span class="comment"># 允许爬虫不断地跟随链接进行爬取</span></span><br><span class="line">        ),</span><br><span class="line">    )</span><br><span class="line"></span><br><span class="line">    headers = &#123;</span><br><span class="line">        <span class="string">&quot;User-Agent&quot;</span>: <span class="string">&quot;Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36&quot;</span></span><br><span class="line">                      <span class="string">&quot; (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36&quot;</span>,</span><br><span class="line">        <span class="string">&quot;Referer&quot;</span>: <span 
class="string">&quot;https://www.deviantart.com/&quot;</span></span><br><span class="line">    &#125;</span><br><span class="line"></span><br><span class="line">	<span class="comment"># 初始化BloomFilter</span></span><br><span class="line">    <span class="built_in">filter</span> = BloomFilter(capacity=<span class="number">15000</span>)</span><br></pre></td></tr></table></figure>
<p><code>DeviantArtImageSpider</code>继承自<code>CrawlSpider</code>，该类是<code>Scrapy</code>最常用的<code>Spider</code>类，它通过<code>Rule</code>类来定义爬取链接的规则，上述代码中使用了正则表达式<code>https://www.deviantart.com/whats-hot/[\?\w+=\d+]*</code>，爬虫将通过这个正则表达式匹配并访问热门页面的每一页。</p>
<h3 id="解析热门页面"><a href="#解析热门页面" class="headerlink" title="解析热门页面"></a>解析热门页面</h3><hr>
<p>爬虫启动时将会先访问热门页面，请求得到响应之后会调用回调函数，我们需要在这个回调函数中获取上述分析中得到的<code>&lt;a class = &#39;torpedo-thumb-link&#39;&gt;</code>标签，然后抽取出每张图片的详情页链接。</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br></pre></td><td class="code"><pre><span class="line">   <span class="function"><span class="keyword">def</span> <span class="title">parse_page</span>(<span class="params">self, response</span>):</span></span><br><span class="line">       soup = self._init_soup(response, <span class="string">&#x27;[PREPARING PARSE PAGE]&#x27;</span>)</span><br><span class="line">       <span class="keyword">if</span> soup <span class="keyword">is</span> <span class="literal">None</span>:</span><br><span class="line">           <span class="keyword">return</span> <span class="literal">None</span></span><br><span class="line">	<span class="comment"># 找到所有class为torpedo-thumb-link的a标签	</span></span><br><span class="line">       all_a_tag = soup.find_all(<span class="string">&#x27;a&#x27;</span>, class_=<span class="string">&#x27;torpedo-thumb-link&#x27;</span>)</span><br><span class="line">     
  <span class="keyword">if</span> all_a_tag <span class="keyword">is</span> <span class="keyword">not</span> <span class="literal">None</span> <span class="keyword">and</span> <span class="built_in">len</span>(all_a_tag) &gt; <span class="number">0</span>:</span><br><span class="line">           <span class="keyword">for</span> a_tag <span class="keyword">in</span> all_a_tag:</span><br><span class="line">			<span class="comment"># 提取图片详情页，然后对详情页链接发起请求，并设置回调函数</span></span><br><span class="line">               detail_link = a_tag[<span class="string">&#x27;href&#x27;</span>]</span><br><span class="line">               request = Request(</span><br><span class="line">                   url=detail_link,</span><br><span class="line">                   headers=self.headers,</span><br><span class="line">                   callback=self.parse_detail_page</span><br><span class="line">               )</span><br><span class="line">			<span class="comment"># 通过request与response对象来传递Item</span></span><br><span class="line">               request.meta[<span class="string">&#x27;item&#x27;</span>] = DeviantArtSpiderItem()</span><br><span class="line">               <span class="keyword">yield</span> request</span><br><span class="line">       <span class="keyword">else</span>:</span><br><span class="line">           self.logger.debug(<span class="string">&#x27;[PARSE FAILED] get &lt;a&gt; tag failed&#x27;</span>)</span><br><span class="line">           <span class="keyword">return</span> <span class="literal">None</span></span><br><span class="line"></span><br><span class="line"><span class="comment"># 初始化BeautifulSoup对象</span></span><br><span class="line">   <span class="function"><span class="keyword">def</span> <span class="title">_init_soup</span>(<span class="params">self, response, log</span>):</span></span><br><span class="line">       url = response.url</span><br><span class="line">       self.headers[<span class="string">&#x27;Referer&#x27;</span>] = url</span><br><span 
class="line">       self.logger.debug(log + <span class="string">&#x27; &#x27;</span> + url)</span><br><span class="line">       body = requests.get(url, headers=self.headers, timeout=<span class="number">2</span>).content</span><br><span class="line">       soup = BeautifulSoup(body, <span class="string">&#x27;lxml&#x27;</span>)</span><br><span class="line">       <span class="keyword">if</span> soup <span class="keyword">is</span> <span class="literal">None</span>:</span><br><span class="line">           self.logger.debug(<span class="string">&#x27;[PARSE FAILED] read %s body failed&#x27;</span> % url)</span><br><span class="line">           <span class="keyword">return</span> <span class="literal">None</span></span><br><span class="line">       <span class="keyword">return</span> soup</span><br></pre></td></tr></table></figure>


<h3 id="解析详情页"><a href="#解析详情页" class="headerlink" title="解析详情页"></a>解析详情页</h3><hr>
<p><code>parse_page()</code>函数会不断地发送请求到详情页链接，解析详情页的回调函数需要将数据封装到<code>Item</code>中，还需要提取详情页中更多图片的详情页链接并继续发送请求。</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br></pre></td><td class="code"><pre><span class="line">   <span class="function"><span class="keyword">def</span> <span class="title">parse_detail_page</span>(<span class="params">self, response</span>):</span></span><br><span class="line">       <span class="keyword">if</span> response.url <span class="keyword">in</span> self.<span class="built_in">filter</span>:</span><br><span class="line">           self.logger.debug(<span class="string">&#x27;[REPETITION] already parse url %s &#x27;</span> % response.url)</span><br><span class="line">           <span class="keyword">return</span> <span class="literal">None</span></span><br><span class="line">       soup = self._init_soup(response, <span class="string">&#x27;[PREPARING DETAIL 
PAGE]&#x27;</span>)</span><br><span class="line">       <span class="keyword">if</span> soup <span class="keyword">is</span> <span class="literal">None</span>:</span><br><span class="line">           <span class="keyword">return</span> <span class="literal">None</span></span><br><span class="line">	<span class="comment"># 包装Item并返回	</span></span><br><span class="line">       <span class="keyword">yield</span> self.packing_item(response.meta[<span class="string">&#x27;item&#x27;</span>], soup)</span><br><span class="line">       self.<span class="built_in">filter</span>.add(response.url)</span><br><span class="line">       <span class="comment"># 继续抓取当前页中的其他图片</span></span><br><span class="line">       all_div_tag = soup.find_all(<span class="string">&#x27;div&#x27;</span>, class_=<span class="string">&#x27;tt-crop thumb&#x27;</span>)</span><br><span class="line">       <span class="keyword">if</span> all_div_tag <span class="keyword">is</span> <span class="keyword">not</span> <span class="literal">None</span> <span class="keyword">and</span> <span class="built_in">len</span>(all_div_tag) &gt; <span class="number">0</span>:</span><br><span class="line">           <span class="keyword">for</span> div_tag <span class="keyword">in</span> all_div_tag:</span><br><span class="line">               detail_link = div_tag.find(<span class="string">&#x27;a&#x27;</span>)[<span class="string">&#x27;href&#x27;</span>]</span><br><span class="line">               request = Request(</span><br><span class="line">                   url=detail_link,</span><br><span class="line">                   headers=self.headers,</span><br><span class="line">                   callback=self.parse_detail_page</span><br><span class="line">               )</span><br><span class="line">               request.meta[<span class="string">&#x27;item&#x27;</span>] = DeviantArtSpiderItem()</span><br><span class="line">               <span class="keyword">yield</span> request</span><br><span class="line">     
  <span class="keyword">else</span>:</span><br><span class="line">           self.logger.debug(<span class="string">&#x27;[PARSE FAILED] get &lt;div&gt; tag failed&#x27;</span>)</span><br><span class="line">           <span class="keyword">return</span> <span class="literal">None</span></span><br><span class="line"></span><br><span class="line"><span class="comment"># 封装数据到Item</span></span><br><span class="line">   <span class="function"><span class="keyword">def</span> <span class="title">packing_item</span>(<span class="params">self, item, soup</span>):</span></span><br><span class="line">       self.logger.debug(<span class="string">&#x27;[PREPARING PACKING ITEM]..........&#x27;</span>)</span><br><span class="line">       img = soup.find(<span class="string">&#x27;img&#x27;</span>, class_=<span class="string">&#x27;dev-content-full&#x27;</span>)</span><br><span class="line">       img_alt = img[<span class="string">&#x27;alt&#x27;</span>] <span class="comment"># alt属性中保存了图片名与作者名</span></span><br><span class="line">       item[<span class="string">&#x27;image_name&#x27;</span>] = img_alt[:img_alt.find(<span class="string">&#x27;by&#x27;</span>) - <span class="number">1</span>]</span><br><span class="line">       item[<span class="string">&#x27;author&#x27;</span>] = img_alt[img_alt.find(<span class="string">&#x27;by&#x27;</span>) + <span class="number">2</span>:]</span><br><span class="line">       item[<span class="string">&#x27;image_id&#x27;</span>] = img[<span class="string">&#x27;data-embed-id&#x27;</span>] <span class="comment"># data-embed-id属性保存了图片id</span></span><br><span class="line">       item[<span class="string">&#x27;image_src&#x27;</span>] = img[<span class="string">&#x27;src&#x27;</span>]</span><br><span class="line">       self.logger.debug(<span class="string">&#x27;[PACKING ITEM FINISHED] %s &#x27;</span> % item)</span><br><span class="line">       <span class="keyword">return</span> item</span><br></pre></td></tr></table></figure>
<h3 id="处理Item"><a href="#处理Item" class="headerlink" title="处理Item"></a>处理Item</h3><hr>
<p>对于<code>Item</code>的处理，只是简单地将图片命名并下载到本地。我没有使用多进程或者多线程，也没有使用<code>Scrapy</code>自带的<code>ImagesPipeline</code>（自由度不高），有兴趣的童鞋可以自己选择实现。</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br><span class="line">43</span><br><span class="line">44</span><br><span class="line">45</span><br><span class="line">46</span><br><span class="line">47</span><br><span class="line">48</span><br><span class="line">49</span><br><span class="line">50</span><br><span class="line">51</span><br><span class="line">52</span><br><span class="line">53</span><br><span class="line">54</span><br><span class="line">55</span><br><span class="line">56</span><br><span class="line">57</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">import</span> requests</span><br><span 
class="line"><span class="keyword">import</span> threading</span><br><span class="line"><span class="keyword">import</span> os</span><br><span class="line"><span class="keyword">from</span> scrapy.exceptions <span class="keyword">import</span> DropItem, CloseSpider</span><br><span class="line"></span><br><span class="line"></span><br><span class="line"><span class="class"><span class="keyword">class</span> <span class="title">DeviantArtSpiderPipeline</span>(<span class="params"><span class="built_in">object</span></span>):</span></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">__init__</span>(<span class="params">self, IMAGE_STORE, MAXIMUM_IMAGE_NUMBER</span>):</span></span><br><span class="line">        <span class="keyword">if</span> IMAGE_STORE <span class="keyword">is</span> <span class="literal">None</span> <span class="keyword">or</span> MAXIMUM_IMAGE_NUMBER <span class="keyword">is</span> <span class="literal">None</span>:</span><br><span class="line">            <span class="keyword">raise</span> CloseSpider(<span class="string">&#x27;Pipeline load settings failed&#x27;</span>)</span><br><span class="line">        self.IMAGE_STORE = IMAGE_STORE</span><br><span class="line">        self.MAXIMUM_IMAGE_NUMBER = MAXIMUM_IMAGE_NUMBER</span><br><span class="line">        <span class="comment"># 记录当前下载的图片数量</span></span><br><span class="line">        self.image_max_counter = <span class="number">0</span></span><br><span class="line">        <span class="comment"># 根据图片数量创建文件夹，每1000张在一个文件夹中</span></span><br><span class="line">        self.dir_counter = <span class="number">0</span></span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">process_item</span>(<span class="params">self, item, spider</span>):</span></span><br><span class="line">        <span class="keyword">if</span> item <span class="keyword">is</span> <span 
class="literal">None</span>:</span><br><span class="line">            <span class="keyword">raise</span> DropItem(<span class="string">&#x27;Item is null&#x27;</span>)</span><br><span class="line">        dir_path = self.make_dir()</span><br><span class="line">		<span class="comment"># 拼接图片名称</span></span><br><span class="line">        image_final_name = item[<span class="string">&#x27;image_name&#x27;</span>] + <span class="string">&#x27;-&#x27;</span> + item[<span class="string">&#x27;image_id&#x27;</span>] + <span class="string">&#x27;-by@&#x27;</span> + item[<span class="string">&#x27;author&#x27;</span>] + <span class="string">&#x27;.jpg&#x27;</span></span><br><span class="line">        dest_path = os.path.join(dir_path, image_final_name)</span><br><span class="line">        self.download_image(item[<span class="string">&#x27;image_src&#x27;</span>], dest_path)</span><br><span class="line">        self.image_max_counter += <span class="number">1</span></span><br><span class="line">        <span class="keyword">if</span> self.image_max_counter &gt;= self.MAXIMUM_IMAGE_NUMBER:</span><br><span class="line">            <span class="keyword">raise</span> CloseSpider(<span class="string">&#x27;Current downloaded image already equal maximum number&#x27;</span>)</span><br><span class="line">        <span class="keyword">return</span> item</span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">make_dir</span>(<span class="params">self</span>):</span></span><br><span class="line">        print(<span class="string">&#x27;[IMAGE_CURRENT NUMBER] %d &#x27;</span> % self.image_max_counter)</span><br><span class="line">        <span class="keyword">if</span> self.image_max_counter % <span class="number">1000</span> == <span class="number">0</span>:</span><br><span class="line">            self.dir_counter += <span class="number">1</span></span><br><span class="line">        path = 
os.path.abspath(self.IMAGE_STORE)</span><br><span class="line">        path = os.path.join(path, <span class="string">&#x27;crawl_images&#x27;</span>)</span><br><span class="line">        path = os.path.join(path, <span class="string">&#x27;dir-&#x27;</span> + <span class="built_in">str</span>(self.dir_counter))</span><br><span class="line">        <span class="keyword">if</span> <span class="keyword">not</span> os.path.exists(path):</span><br><span class="line">            os.makedirs(path)</span><br><span class="line">            print(<span class="string">&#x27;[CREATED DIR] %s &#x27;</span> % path)</span><br><span class="line">        <span class="keyword">return</span> path</span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">download_image</span>(<span class="params">self, src, dest</span>):</span></span><br><span class="line">        print(<span class="string">&#x27;[Thread %s] preparing download image.....&#x27;</span> % threading.current_thread().name)</span><br><span class="line">        response = requests.get(src, timeout=<span class="number">2</span>)</span><br><span class="line">        <span class="keyword">if</span> response.status_code == <span class="number">200</span>:</span><br><span class="line">            <span class="keyword">with</span> <span class="built_in">open</span>(dest, <span class="string">&#x27;wb&#x27;</span>) <span class="keyword">as</span> f:</span><br><span class="line">                f.write(response.content)</span><br><span class="line">                print(<span class="string">&#x27;[DOWNLOAD FINISHED] from %s to %s &#x27;</span> % (src, dest))</span><br><span class="line">        <span class="keyword">else</span>:</span><br><span class="line">            <span class="keyword">raise</span> DropItem(<span class="string">&#x27;[Thread %s] request image src failed status code = %s&#x27;</span></span><br><span class="line">                    
       % (threading.current_thread().name, response.status_code))</span><br><span class="line"></span><br><span class="line"><span class="meta">    @classmethod</span></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">from_crawler</span>(<span class="params">cls, crawler</span>):</span></span><br><span class="line">        settings = crawler.settings</span><br><span class="line">        <span class="keyword">return</span> cls(settings[<span class="string">&#x27;IMAGES_STORE&#x27;</span>], settings[<span class="string">&#x27;MAXIMUM_IMAGE_NUMBER&#x27;</span>])</span><br></pre></td></tr></table></figure>
<p>在<code>settings.py</code>中注册该<code>Pipeline</code></p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br></pre></td><td class="code"><pre><span class="line">ITEM_PIPELINES = &#123;</span><br><span class="line">    <span class="string">&#x27;deviant_art_spider.pipelines.DeviantArtSpiderPipeline&#x27;</span>: <span class="number">300</span>,</span><br><span class="line">&#125;</span><br></pre></td></tr></table></figure>
<h3 id="IP代理池"><a href="#IP代理池" class="headerlink" title="IP代理池"></a>IP代理池</h3><hr>
<p>有些网站会有反爬虫机制，为了解决这个问题，每次请求都使用不同的<code>IP</code>代理，有很多网站提供<code>IP</code>代理服务，我们需要写一个爬虫从<a target="_blank" rel="noopener" href="http://www.ip3366.net/free/">云代理</a>中抓取它提供的免费<code>IP</code>代理（免费<code>IP</code>很不稳定，而且我用了代理之后反而各种请求失败了Orz…）。</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br><span class="line">43</span><br><span class="line">44</span><br><span class="line">45</span><br><span class="line">46</span><br><span class="line">47</span><br><span class="line">48</span><br><span class="line">49</span><br><span class="line">50</span><br><span class="line">51</span><br><span class="line">52</span><br><span class="line">53</span><br><span class="line">54</span><br><span class="line">55</span><br><span class="line">56</span><br><span class="line">57</span><br><span class="line">58</span><br><span class="line">59</span><br><span class="line">60</span><br><span 
class="line">61</span><br><span class="line">62</span><br><span class="line">63</span><br><span class="line">64</span><br><span class="line">65</span><br><span class="line">66</span><br><span class="line">67</span><br><span class="line">68</span><br><span class="line">69</span><br><span class="line">70</span><br><span class="line">71</span><br><span class="line">72</span><br><span class="line">73</span><br><span class="line">74</span><br><span class="line">75</span><br><span class="line">76</span><br><span class="line">77</span><br><span class="line">78</span><br><span class="line">79</span><br><span class="line">80</span><br><span class="line">81</span><br><span class="line">82</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">import</span> os</span><br><span class="line"></span><br><span class="line"><span class="keyword">import</span> requests</span><br><span class="line"><span class="keyword">from</span> bs4 <span class="keyword">import</span> BeautifulSoup</span><br><span class="line"></span><br><span class="line"></span><br><span class="line"><span class="class"><span class="keyword">class</span> <span class="title">ProxiesSpider</span>(<span class="params"><span class="built_in">object</span></span>):</span></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">__init__</span>(<span class="params">self, max_page_number=<span class="number">10</span></span>):</span></span><br><span class="line">        self.seed = <span class="string">&#x27;http://www.ip3366.net/free/&#x27;</span></span><br><span class="line">        self.max_page_number = max_page_number <span class="comment"># 最大页数</span></span><br><span class="line">        self.crawled_proxies = [] <span class="comment"># 爬到的ip,每个元素都是一个dict</span></span><br><span class="line">        self.verified_proxies = [] <span class="comment"># 校验过的ip</span></span><br><span class="line">        self.headers = 
&#123;</span><br><span class="line">            <span class="string">&#x27;Accept&#x27;</span>: <span class="string">&#x27;*/*&#x27;</span>,</span><br><span class="line">            <span class="string">&#x27;User-Agent&#x27;</span>: <span class="string">&#x27;Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)&#x27;</span></span><br><span class="line">                          <span class="string">&#x27; Chrome/45.0.2454.101 Safari/537.36&#x27;</span>,</span><br><span class="line">            <span class="string">&#x27;Accept-Language&#x27;</span>: <span class="string">&#x27;zh-CN,zh;q=0.8&#x27;</span></span><br><span class="line">        &#125;</span><br><span class="line">        self.tocrawl_url = []</span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">crawl</span>(<span class="params">self</span>):</span></span><br><span class="line">        self.tocrawl_url.append(self.seed)</span><br><span class="line">        page_counter = <span class="number">1</span></span><br><span class="line">        <span class="keyword">while</span> self.tocrawl_url:</span><br><span class="line">            <span class="keyword">if</span> page_counter &gt; self.max_page_number:</span><br><span class="line">                <span class="keyword">break</span></span><br><span class="line">            url = self.tocrawl_url.pop()</span><br><span class="line">            body = requests.get(url=url, headers=self.headers, params=&#123;<span class="string">&#x27;page&#x27;</span>: page_counter&#125;).content</span><br><span class="line">            soup = BeautifulSoup(body, <span class="string">&#x27;lxml&#x27;</span>)</span><br><span class="line">            <span class="keyword">if</span> soup <span class="keyword">is</span> <span class="literal">None</span>:</span><br><span class="line">                print(<span class="string">&#x27;PARSE PAGE 
FAILED.......&#x27;</span>)</span><br><span class="line">                <span class="keyword">continue</span></span><br><span class="line">            self.parse_page(soup)</span><br><span class="line">            print(<span class="string">&#x27;Parse page %s done&#x27;</span> % (url + <span class="string">&#x27;?page=&#x27;</span> + <span class="built_in">str</span>(page_counter)))</span><br><span class="line">            page_counter += <span class="number">1</span></span><br><span class="line">            self.tocrawl_url.append(url)</span><br><span class="line">        self.verify_proxies()</span><br><span class="line">        self.download()</span><br><span class="line">	</span><br><span class="line">	<span class="comment"># 解析页面并封装</span></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">parse_page</span>(<span class="params">self, soup</span>):</span></span><br><span class="line">        table = soup.find(<span class="string">&#x27;table&#x27;</span>, class_=<span class="string">&#x27;table table-bordered table-striped&#x27;</span>)</span><br><span class="line">        tr_list = table.tbody.find_all(<span class="string">&#x27;tr&#x27;</span>)</span><br><span class="line">        <span class="keyword">for</span> tr <span class="keyword">in</span> tr_list:</span><br><span class="line">            ip = tr.contents[<span class="number">1</span>].text</span><br><span class="line">            port = tr.contents[<span class="number">3</span>].text</span><br><span class="line">            protocol = tr.contents[<span class="number">7</span>].text.lower()</span><br><span class="line">            url = protocol + <span class="string">&#x27;://&#x27;</span> + ip + <span class="string">&#x27;:&#x27;</span> + port</span><br><span class="line">            self.crawled_proxies.append(&#123;url: protocol&#125;)</span><br><span class="line">            print(<span class="string">&#x27;Add url %s to 
crawled_proxies&#x27;</span> % url)</span><br><span class="line"></span><br><span class="line">	<span class="comment"># 对ip进行校验</span></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">verify_proxies</span>(<span class="params">self</span>):</span></span><br><span class="line">        print(<span class="string">&#x27;Start verify proxies.......&#x27;</span>)</span><br><span class="line">        <span class="keyword">while</span> self.crawled_proxies:</span><br><span class="line">            self.verify_proxy(self.crawled_proxies.pop())</span><br><span class="line">        print(<span class="string">&#x27;Verify proxies done.....&#x27;</span>)</span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">verify_proxy</span>(<span class="params">self, proxy</span>):</span></span><br><span class="line">        proxies = &#123;&#125;</span><br><span class="line">        <span class="keyword">for</span> key <span class="keyword">in</span> proxy:</span><br><span class="line">            proxies[<span class="built_in">str</span>(proxy[key])] = key <span class="comment"># requests的proxies的格式必须为 协议 : 地址</span></span><br><span class="line">        <span class="keyword">try</span>:</span><br><span class="line">            <span class="keyword">if</span> requests.get(<span class="string">&#x27;https://www.deviantart.com/&#x27;</span>, proxies=proxies, timeout=<span class="number">2</span>).status_code == <span class="number">200</span>:</span><br><span class="line">                print(<span class="string">&#x27;verify proxy success %s &#x27;</span> % proxies)</span><br><span class="line">                self.verified_proxies.append(proxy)</span><br><span class="line">        <span class="keyword">except</span>:</span><br><span class="line">            print(<span class="string">&#x27;verify proxy fail %s &#x27;</span> % 
proxies)</span><br><span class="line">	</span><br><span class="line">	<span class="comment"># 保存到文件中</span></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">download</span>(<span class="params">self</span>):</span></span><br><span class="line">        current_path = os.getcwd()</span><br><span class="line">        parent_path = os.path.dirname(current_path)</span><br><span class="line">        <span class="keyword">with</span> <span class="built_in">open</span>(parent_path + <span class="string">&#x27;\proxies.txt&#x27;</span>, <span class="string">&#x27;w&#x27;</span>) <span class="keyword">as</span> f:</span><br><span class="line">            <span class="keyword">for</span> proxy <span class="keyword">in</span> self.verified_proxies:</span><br><span class="line">                <span class="keyword">for</span> key <span class="keyword">in</span> proxy.keys():</span><br><span class="line">                    f.write(key + <span class="string">&#x27;\n&#x27;</span>)</span><br><span class="line"></span><br><span class="line"></span><br><span class="line"><span class="keyword">if</span> __name__ == <span class="string">&#x27;__main__&#x27;</span>:</span><br><span class="line">    spider = ProxiesSpider()</span><br><span class="line">    spider.crawl()</span><br></pre></td></tr></table></figure>
<p>得到了<code>IP</code>代理池之后，还要在<code>Scrapy</code>的<code>middlewares.py</code>模块定义代理中间件类。</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">import</span> time</span><br><span class="line"><span class="keyword">from</span> scrapy <span class="keyword">import</span> signals</span><br><span class="line"><span class="keyword">import</span> os</span><br><span class="line"><span class="keyword">import</span> random</span><br><span class="line"></span><br><span class="line"></span><br><span class="line"><span class="class"><span class="keyword">class</span> <span class="title">ProxyMiddleware</span>(<span class="params"><span class="built_in">object</span></span>):</span></span><br><span class="line">	<span class="comment"># 
每次请求前从IP代理池中选择一个IP代理并进行设置</span></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">process_request</span>(<span class="params">self, request, spider</span>):</span></span><br><span class="line">        proxy = self.get_proxy(self.make_path())</span><br><span class="line">        print(<span class="string">&#x27;Acquire proxy %s &#x27;</span> % proxy)</span><br><span class="line">        request.meta[<span class="string">&#x27;proxy&#x27;</span>] = proxy</span><br><span class="line">	</span><br><span class="line">	<span class="comment"># 请求失败，重新设置IP代理</span></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">process_response</span>(<span class="params">self, request, response, spider</span>):</span></span><br><span class="line">        <span class="keyword">if</span> response.status != <span class="number">200</span>:</span><br><span class="line">            proxy = self.get_proxy(self.make_path())</span><br><span class="line">            print(<span class="string">&#x27;Response status code is not 200,try reset request proxy %s &#x27;</span> % proxy)</span><br><span class="line">            request.meta[<span class="string">&#x27;proxy&#x27;</span>] = proxy</span><br><span class="line">            <span class="keyword">return</span> request</span><br><span class="line">        <span class="keyword">return</span> response</span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">make_path</span>(<span class="params">self</span>):</span></span><br><span class="line">        current = os.path.abspath(<span class="string">&#x27;.&#x27;</span>)</span><br><span class="line">        parent = os.path.dirname(current)</span><br><span class="line">        <span class="keyword">return</span> os.path.dirname(parent) + <span 
class="string">&#x27;\proxies.txt&#x27;</span></span><br><span class="line">	</span><br><span class="line">	<span class="comment"># 从IP代理文件中随机获得一个IP代理地址</span></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">get_proxy</span>(<span class="params">self, path</span>):</span></span><br><span class="line">        <span class="keyword">if</span> <span class="keyword">not</span> os.path.isfile(path):</span><br><span class="line">            print(<span class="string">&#x27;[LOADING PROXY] loading proxies failed proxies file is not exist&#x27;</span>)</span><br><span class="line">        <span class="keyword">while</span> <span class="literal">True</span>:</span><br><span class="line">            <span class="keyword">with</span> <span class="built_in">open</span>(path, <span class="string">&#x27;r&#x27;</span>) <span class="keyword">as</span> f:</span><br><span class="line">                proxies = f.readlines()</span><br><span class="line">            <span class="keyword">if</span> proxies:</span><br><span class="line">                <span class="keyword">break</span></span><br><span class="line">            <span class="keyword">else</span>:</span><br><span class="line">                time.sleep(<span class="number">1</span>)</span><br><span class="line">        <span class="keyword">return</span> random.choice(proxies).strip()</span><br></pre></td></tr></table></figure>
<p>最后在<code>settings.py</code>中进行注册。</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br></pre></td><td class="code"><pre><span class="line">DOWNLOADER_MIDDLEWARES = &#123;</span><br><span class="line">	<span class="comment"># 这个中间件是由scrapy提供的，并且它是必需的</span></span><br><span class="line">    <span class="string">&#x27;scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware&#x27;</span>: <span class="number">543</span>, </span><br><span class="line">	<span class="comment"># 我们自定义的代理中间件</span></span><br><span class="line">    <span class="string">&#x27;deviant_art_spider.middlewares.ProxyMiddleware&#x27;</span>: <span class="number">540</span> </span><br><span class="line">&#125;</span><br></pre></td></tr></table></figure>
<h3 id="End"><a href="#End" class="headerlink" title="End"></a>End</h3><hr>
<p>我们的图片爬虫已经完成了，执行命令<code>scrapy crawl deviant_art_image_spider</code>，然后尽情搜集图片吧！</p>
<p><img src="http://wx2.sinaimg.cn/large/63503acbly1fjrmgmpigfg214o0ihe2b.gif"></p>
<p><strong><a target="_blank" rel="noopener" href="https://github.com/SylvanasSun/scrapy-picture-spider">想要获得本文中的完整源代码与P站爬虫请点我，顺便求个star…</a></strong></p>
<blockquote>
<p>最近心血来潮想要写爬虫，所以花了点时间过了一遍<code>python</code>语法便匆匆上手了，代码写得有点丑也不够pythonic，各位看官求轻吐槽。</p>
</blockquote>

      
    </div>

    
    
    

    <footer class="post-footer">
        <div class="post-eof"></div>
      
    </footer>
  </article>
</div>




    


<div class="post-block">
  
  

  <article itemscope itemtype="http://schema.org/Article" class="post-content" lang="zh">
    <link itemprop="mainEntityOfPage" href="https://suyuhuan.gitee.io/yuwanzi.io/2017/09/08/2017-09-08-ComputerStructure/">

    <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
      <meta itemprop="image" content="/yuwanzi.io/images/avatar.gif">
      <meta itemprop="name" content="玉丸子">
      <meta itemprop="description" content="这里是玉丸子的个人博客,与你一起发现更大的世界。">
    </span>

    <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
      <meta itemprop="name" content="玉丸子 | Blog">
    </span>
      <header class="post-header">
        <h2 class="post-title" itemprop="name headline">
          <a href="/yuwanzi.io/2017/09/08/2017-09-08-ComputerStructure/" class="post-title-link" itemprop="url">探索计算机的结构与核心概念</a>
        </h2>

        <div class="post-meta-container">
          <div class="post-meta">
    <span class="post-meta-item">
      <span class="post-meta-item-icon">
        <i class="far fa-calendar"></i>
      </span>
      <span class="post-meta-item-text">Veröffentlicht am</span>

      <time title="Erstellt: 2017-09-08 18:00:00" itemprop="dateCreated datePublished" datetime="2017-09-08T18:00:00+08:00">2017-09-08</time>
    </span>
      <span class="post-meta-item">
        <span class="post-meta-item-icon">
          <i class="far fa-calendar-check"></i>
        </span>
        <span class="post-meta-item-text">Bearbeitet am</span>
        <time title="Geändert am: 2020-11-07 08:58:17" itemprop="dateModified" datetime="2020-11-07T08:58:17+08:00">2020-11-07</time>
      </span>
    <span class="post-meta-item">
      <span class="post-meta-item-icon">
        <i class="far fa-folder"></i>
      </span>
      <span class="post-meta-item-text">in</span>
        <span itemprop="about" itemscope itemtype="http://schema.org/Thing">
          <a href="/yuwanzi.io/categories/%E8%AE%A1%E7%AE%97%E6%9C%BA/" itemprop="url" rel="index"><span itemprop="name">计算机</span></a>
        </span>
    </span>

  
</div>

        </div>
      </header>

    
    
    
    <div class="post-body" itemprop="articleBody">
          <p>在我们的生活与工作中所使用到的计算机都是基于冯诺依曼结构实现的,冯诺依曼结构又称冯诺依曼模型或普林斯顿结构,它是一种将程序指令存储器和数据存储器合并在一起的计算机设计概念结构.</p>
<p>冯诺依曼结构起源于<code>EDVAC(Electronic Discrete Variable Automatic Computer)</code>离散变量自动电子计算机,当时冯诺依曼以技术顾问的身份加入<code>EDVAC</code>项目组,负责总结和详细说明<code>EDVAC</code>的逻辑设计,直到1945年6月发表了一份长达101页的报告,这就是计算机史上著名的“101页报告”,该报告明确规定<strong>用二进制替代十进制运算</strong>,并<strong>将计算机分成五大组件</strong>,这一卓越的思想为电子计算机的逻辑结构设计奠定了基础,已成为计算机设计的基本原则.</p>
<p><img src="https://upload.wikimedia.org/wikipedia/commons/thumb/8/84/Von_Neumann_architecture.svg/420px-Von_Neumann_architecture.svg.png" alt="冯诺依曼结构"></p>
<p>冯诺依曼结构具有以下特点: </p>
<ol>
<li><p>数据由一个贯穿整个结构的总线来进行传输.</p>
</li>
<li><p>存储器是按地址访问、线性编址的空间</p>
</li>
<li><p>指令由操作码和地址码组成</p>
</li>
<li><p>数据以二进制编码</p>
</li>
<li><p>一个冯诺依曼结构的计算机必须有存储器,控制单元,运算单元,输入输出设备.</p>
</li>
</ol>
<p>冯诺依曼结构将<code>CPU</code>与存储器分开的做法也并非十全十美,<strong><code>CPU</code>和内存、硬盘等设备的数据传输速度不匹配</strong>成了整体效率的瓶颈,<code>CPU</code>会在等待数据输入的时间中空置,许多技术都是为了解决这个瓶颈,例如<code>DMA(直接内存访问)</code>,在<code>CPU</code>中建立高速缓冲区等.</p>
<blockquote>
<p>本文作者为: <a target="_blank" rel="noopener" href="https://github.com/SylvanasSun">SylvanasSun(sylvanas.sun@gmail.com)</a>.转载请务必将下面这段话置于文章开头处(保留超链接).<br>本文首发自<a target="_blank" rel="noopener" href="https://sylvanassun.github.io/">SylvanasSun Blog</a>,原文链接: <a target="_blank" rel="noopener" href="https://sylvanassun.github.io/2017/09/08/2017-09-08-ComputerStructure/">https://sylvanassun.github.io/2017/09/08/2017-09-08-ComputerStructure/</a></p>
</blockquote>
<h3 id="现代计算机结构"><a href="#现代计算机结构" class="headerlink" title="现代计算机结构"></a>现代计算机结构</h3><hr>
<p><strong>现代计算机是基于冯诺依曼结构的电子计算机</strong>.所谓电子计算机,就是一种利用电子学原理,根据一系列指令对数据进行处理的机器.</p>
<p>晶体管是组成现代电子计算机的最原始的部件(集成电路中含有数以亿计的晶体管),它是一种由半导体材料(导电性可受控制,范围可从绝缘体至导体之间)制成的器件,晶体管可以通过电流的变化,实现电路的切换,这种特性非常适合组成各种逻辑门(与或非)与表示二进制数据.值得一提的是,早期使用继电器实现逻辑门的计算机体积甚至大到要一整个屋子才能放下.</p>
<p>现代计算机的硬件结构如下图,虽然多了很多其他的硬件但与冯诺依曼结构的概念是一致的: </p>
<p><img src="http://wx4.sinaimg.cn/large/63503acbly1fius74iz98j212e0po778.jpg"></p>
<h3 id="总线"><a href="#总线" class="headerlink" title="总线"></a>总线</h3><hr>
<p><strong>总线是一组贯穿所有硬件结构的电子管道,它携带数据并负责在各个部件间交互传递</strong>.总线传送的数据通常为一个定长的字节块,这个字节块的长度即是总线的位宽,总线位宽越大,数据传输的性能就越高,在32位机器中总线位宽为4个字节,64位机器中为8个字节.</p>
<p>有意思的是总线的英文单词是<code>bus</code>,如果把主板想象成一座城市,那么总线就像是城市中的公共汽车,它按着多种固定线路不停地来回传输数据.</p>
<h3 id="I-O设备"><a href="#I-O设备" class="headerlink" title="I/O设备"></a>I/O设备</h3><hr>
<p><strong><code>I/O(输入/输出)</code>设备是计算机与外部进行联系的桥梁,每个<code>I/O</code>设备都要通过一个控制器或者适配器来与<code>I/O</code>总线相连</strong>.</p>
<p>控制器与适配器的区别只在于它们的封装方式,它们的功能都是为了让<code>I/O</code>设备与<code>I/O</code>总线进行连接: </p>
<ul>
<li><p>控制器是<code>I/O</code>设备本身或者主板上自带的芯片组</p>
</li>
<li><p><code>适配器</code>则是插在主板上的外部设备.</p>
</li>
</ul>
<p>在图中,<code>I/O</code>设备包含鼠标、键盘(输入设备)、显示器(输出设备)、磁盘、网络.</p>
<h3 id="内存"><a href="#内存" class="headerlink" title="内存"></a>内存</h3><hr>
<p><strong>内存也叫主存,它是一个临时的存储设备,存储了运行时的数据(程序与程序处理的数据),以供CPU进行处理</strong>.内存是由一组<code>DRAM</code>(动态随机存取存储器)芯片组成的,<code>DRAM</code>是<code>RAM</code>(随机存取存储器)的一种,另一种为<code>SRAM</code>(静态随机存取存储器),<code>SRAM</code>比<code>DRAM</code>速度更快,但造价也更贵,通常用来实现为高速缓存区.</p>
<p><strong>32位操作系统中的<code>CPU</code>的最大寻址空间只有<code>2^32</code>字节</strong>,换算下来最高内存上限为4GB,但由于<code>CPU</code>还要对<code>BIOS</code>和其他硬件等进行寻址(这些优先级更高),所以用户实际可用的内存只有3GB左右.</p>
<p>64位操作系统的<code>CPU</code>最大寻址空间足足有<code>2^64</code>字节,也就是16EB(1024GB等于1TB,1024TB等于1PB,1024PB等于1EB),这已经是一个无法想象的数字了,不过这也不一定是够用的,毕竟谁又能知道未来的数据量会有多庞大呢?</p>
<p>内存具有以下特点: </p>
<ul>
<li>随机存取: 当存储器中的数据被写入或读取时,所需要的时间与数据所在的位置无关(从逻辑上,可以把内存看成一个线性的字节数组,每个字节都有其唯一的地址(索引),这些地址是从零开始的).</li>
</ul>
<ul>
<li>易失性: 如果电源突然断开,<code>RAM</code>中的数据就会全部丢失(磁盘可以将数据持久化地永久保存下来,就算断电也不会丢失数据).</li>
</ul>
<ul>
<li>依赖刷新: <code>RAM</code>使用电容器来存储数据,当电容器充满电之后表示<code>1</code>,未充电则表示<code>0</code>.由于电容器或多或少有漏电的情形,若不作特别处理,电荷会渐渐随时间流失而使数据发生错误.刷新是指重新为电容器充电,弥补流失了的电荷.<code>DRAM</code>的读取即有刷新的功效,但一般的定时刷新并不需要作完整的读取,只需作该芯片的一个列选择,整列的数据即可获得刷新,而同一时间内,所有相关记忆芯片均可同时作同一列选择,因此,在一段期间内逐一做完所有列的刷新,即可完成所有存储器的刷新.需要刷新正好解释了随机存取存储器的易失性.</li>
</ul>
<ul>
<li>对静电敏感: <code>RAM</code>与集成电路一样,对环境的静电荷非常敏感,静电会干扰存储器内电容器的电荷,导致数据流失,甚至烧坏电路.</li>
</ul>
<h3 id="CPU"><a href="#CPU" class="headerlink" title="CPU"></a>CPU</h3><hr>
<p><img src="https://upload.wikimedia.org/wikipedia/commons/4/46/Intel_Core_I7-920_Boxed_-_14.JPG" alt="Intel I7 CPU"></p>
<p><code>Central Processing Unit</code>中央处理单元,简称<code>CPU</code>或处理器,<code>CPU</code>包含了冯诺依曼结构中的控制器与运算器,它是<strong>解释或执行存储在内存中的指令</strong>的引擎.<code>CPU</code>好比计算机的大脑,从通电开始,直到断电,<code>CPU</code>一直在不断地执行内存中存储的指令.如果没有<code>CPU</code>,那么计算机就会是一台不会动的死机器了.</p>
<p>所谓<strong>指令就是进行指定操作的操作码</strong>,而<strong>指令集架构就是这些操作码的集合</strong>,至于<strong>微架构是一套用于执行指令集的微处理器设计方法,多个不同微架构的<code>CPU</code>可以使用同一套指令集</strong>,一些常见的指令如下: </p>
<ul>
<li>加载: 从内存中复制数据(多少个字节取决于总线位宽)到寄存器,以覆盖寄存器中原来的内容.</li>
</ul>
<ul>
<li>存储: 从寄存器复制数据到内存中的某个位置,以覆盖这个位置上原有的内容.</li>
</ul>
<ul>
<li>操作: 把两个在寄存器中的数据复制到<code>ALU</code>,<code>ALU</code>对这2个数据进行算术运算,并将结果存放到一个寄存器中,以覆盖该寄存器中原有的内容.</li>
</ul>
<ul>
<li>跳转: 从指令本身中抽取数据(地址),将它复制到程序计数器中,以覆盖程序计数器原有的内容.</li>
</ul>
<p>下面以一个简单的算术问题<code>1 + 1</code>来大致了解一下<code>CPU</code>的工作流程: </p>
<ol>
<li><p>这两个变量首先会被存储在内存中.</p>
</li>
<li><p><code>CPU</code>从内存中读取指令并刷新程序计数器(每执行完一个指令都要刷新程序计数器).</p>
</li>
<li><p><code>CPU</code>执行加载指令,通过总线将这两个变量传输(复制)到寄存器.</p>
</li>
<li><p><code>CPU</code>执行运算指令,从寄存器中复制这两个变量进行算术运算,并将结果存到寄存器.</p>
</li>
<li><p><code>CPU</code>执行存储指令,寄存器通过总线将结果存储回内存(覆盖原有位置).</p>
</li>
</ol>
<h4 id="寄存器"><a href="#寄存器" class="headerlink" title="寄存器"></a>寄存器</h4><hr>
<p>寄存器是<code>CPU</code>中的一个存储部件,可以认为它是<strong>容量很小但速度飞快的内存</strong>,寄存器是与<code>ALU</code>直接交互的存储设备(不管数据是在内存还是高速缓冲区,<strong>最终都要存到寄存器才能与<code>ALU</code>交互</strong>).</p>
<p>在<code>CPU</code>架构中,拥有多个寄存器,它们分别拥有各自的用途(指令寄存器,整数寄存器,浮点数寄存器等),且<strong>寄存器的数量和它的大小都与指令集架构和机器支持的位宽相关联</strong>(例如<code>x86-64</code>指令集架构(64位指令集架构)中支持64位的通用寄存器与64位整数运算,而<code>x86</code>指令集架构只能支持32位和16位).</p>
<h4 id="程序计数器"><a href="#程序计数器" class="headerlink" title="程序计数器"></a>程序计数器</h4><hr>
<p>程序计数器用于指示将要执行的指令序列,并且不断刷新指向新的指令地址,根据<code>CPU</code>的实现不同,程序计数器可能会指向正在运行的指令地址也可能会是下一个指令的地址.</p>
<h4 id="高速缓冲"><a href="#高速缓冲" class="headerlink" title="高速缓冲"></a>高速缓冲</h4><hr>
<p>由于寄存器与内存的速度相差过大,为了避免性能上的浪费,在寄存器与内存之间建立数据的缓存区是很有必要的.</p>
<p>高速缓存是一个比内存更小但更快的存储设备,且使用<code>SRAM</code>实现,现在的<code>CPU</code>一般都配有三级缓存,<code>L1</code>缓存速度最快但存储的容量也最小,<code>L2</code>要比<code>L1</code>慢但存储的容量也更大,以此类推(<strong>上一层的存储器作为下一层存储器的高速缓存</strong>,也就是说,寄存器就是<code>L1</code>的高速缓存,<code>L1</code>则是<code>L2</code>的高速缓存,<code>L2</code>是<code>L3</code>的高速缓存…).</p>
<p>当<code>CPU</code>发起向内存加载数据的请求时,会先从缓存中查找,如果缓存未命中,才会从内存加载数据,并更新缓存.高速缓存之所以如此有效,主要是利用了<strong>局部性原理,即最近访问过的内存位置以及周边的内存位置很容易会被再次访问</strong>.而高速缓存中就存储着这些经常会被访问的数据.</p>
<p><img src="http://wx1.sinaimg.cn/large/63503acbly1fj1nimvcrvj20ti0gvdjo.jpg"></p>
<h4 id="DMA"><a href="#DMA" class="headerlink" title="DMA"></a>DMA</h4><hr>
<p><code>DMA</code>全称为<code>Direct Memory Access</code>直接内存访问,它<strong>允许其他硬件可以直接访问内存中的数据,而无需让<code>CPU</code>介入处理</strong>.一般会使用到<code>DMA</code>的硬件有显卡、网卡、声卡等.</p>
<p><code>DMA</code>可能会导致缓存不一致的问题,需要额外地进行同步操作以保证数据安全.例如,当<code>CPU</code>从内存中读取数据后,会暂时将新数据写入缓存中,但还没有将数据更新回内存,如果在这期间发生了<code>DMA</code>,就会读取到旧的数据.</p>
<p><img src="https://upload.wikimedia.org/wikipedia/commons/4/49/Cache_incoherence_write.svg" alt="缓存一致性问题"></p>
<h4 id="流水线"><a href="#流水线" class="headerlink" title="流水线"></a>流水线</h4><hr>
<p>流水线又称管线,是现代<code>CPU</code>中必不可少的优化技术,它<strong>将指令的处理过程拆分为多个步骤,并通过多个硬件处理单元并行执行这些步骤.</strong></p>
<p>管线的具体执行过程很像工厂中的流水线(指令就像在流水线传送带上的产品,各个硬件处理单元就像是在流水线旁进行操作的工人),因此而得名为流水线.</p>
<p>流水线虽然提高了整体的吞吐量,但也是有其缺点的,这是由于流水线依赖于分支预测,如果<code>CPU</code>预测的分支是错误的,那么整个流水线上的所有指令都要取消,然后重新向流水线填充指令,这项操作是很耗费性能的.</p>
<h4 id="超线程"><a href="#超线程" class="headerlink" title="超线程"></a>超线程</h4><hr>
<p><strong>超线程是一种允许一个<code>CPU</code>执行多个控制流的技术</strong>,它复制了<code>CPU</code>中必要的硬件资源(程序计数器、寄存器),来让其在同一时间内处理两个线程的工作.</p>
<p>通过超线程技术,可以让一个<code>CPU</code>核心去执行两个线程,所以一个带有4核(实体核心)的<code>CPU</code>实际上可以执行8个线程(逻辑线程).</p>
<h4 id="多核"><a href="#多核" class="headerlink" title="多核"></a>多核</h4><hr>
<p><strong>多核<code>CPU</code>是指将多个核心(也就是<code>CPU</code>)集成到一个集成电路芯片上.每个核心都可以独立地执行指令,也就是真正意义上的并行执行.</strong></p>
<p>每个核心都拥有独立的寄存器,程序计数器,高速缓存等组件,一般还会有一个所有核心共享的缓存,它是直接与内存连通的缓冲区.</p>
<p><img src="https://upload.wikimedia.org/wikipedia/commons/e/ec/Dual_Core_Generic.svg"></p>
<p>多核<code>CPU</code>与多处理器不同,多处理器是将多个<code>CPU</code>封装在多个独立的集成电路芯片中,而多核<code>CPU</code>是所有核心都封装在同一个集成电路芯片中.</p>
<h3 id="操作系统"><a href="#操作系统" class="headerlink" title="操作系统"></a>操作系统</h3><hr>
<p><img src="https://upload.wikimedia.org/wikipedia/commons/e/e1/Operating_system_placement.svg"></p>
<p><strong>操作系统是用于管理计算机硬件与软件的程序,可以把操作系统看成是应用程序与硬件之间插入的一层软件</strong>,所有应用程序对硬件的操作尝试都必须通过操作系统.</p>
<p>操作系统需要负责管理与配置内存、调度系统资源的优先次序、管理进程与线程、控制I/O设备、操作网络与管理文件系统等事务.可以说操作系统是整个计算机系统中的灵魂所在.</p>
<p><img src="https://upload.wikimedia.org/wikipedia/commons/thumb/4/45/Linux_kernel_System_Call_Interface_and_glibc.svg/800px-Linux_kernel_System_Call_Interface_and_glibc.svg.png" alt="System Call"></p>
<p>操作系统的内核是操作系统最核心的地方,它是代码和数据的一个集合.当应用程序需要操作系统的某些操作时,会执行一条系统调用(<code>system call</code>)指令,这时,控制权会被移交到内核,由内核执行被请求的操作并返回到应用程序.大多数系统的交互式操作都需要在内核完成,例如<code>I/O</code>、进程管理等.</p>
<h3 id="虚拟内存"><a href="#虚拟内存" class="headerlink" title="虚拟内存"></a>虚拟内存</h3><hr>
<p><img src="https://upload.wikimedia.org/wikipedia/commons/thumb/6/6e/Virtual_memory.svg/620px-Virtual_memory.svg.png"></p>
<p>虚拟内存是计算机系统内存管理的一种技术,<strong>它为每个进程提供了一个假象,即每个进程都在独占地使用内存(一个连续的地址空间)</strong>,而实际上,它通常被分割为多个物理内存碎片,还有部分暂时存储在磁盘存储器上,在需要时进行数据交换.使用虚拟内存会使程序的编写更加容易,对真实的物理内存的使用也会更加有效率.</p>
<p><img src="http://wx3.sinaimg.cn/large/63503acbly1fj7caakdj7j20bs0eigm6.jpg" alt="进程的虚拟地址空间"></p>
<p>每个进程所能看到的虚拟地址空间大致如上图所示,每个区域都有它专门的作用.</p>
<ul>
<li>内核虚拟内存: 这个区域是为操作系统内核保留的,它不允许应用程序读写这个区域的内容或者直接调用内核代码定义的函数(只有操作系统内核才有权限).</li>
</ul>
<ul>
<li>共享库: 以C语言为例,这个区域用来存放像C标准库这样的共享库的代码和数据.</li>
</ul>
<ul>
<li>程序代码和数据: 对于所有进程来说,代码都是从同一固定地址开始,紧接着的是其相对应的数据位置.这片区域就是用来存放代码和数据的.</li>
</ul>
<ul>
<li>堆: <strong>堆内存是指应用程序在运行时进行分配的内存区域,堆可以在运行时动态地扩展和收缩</strong>.像<code>malloc()</code>和<code>free()</code>这样的函数就是在堆内存中进行分配空间与释放,而类似<code>Java</code>这种更高一级的语言提供了自动内存管理和垃圾回收,不需要程序员手动地分配与释放堆内存空间.</li>
</ul>
<ul>
<li>栈: 栈同样也是可以动态地扩展和收缩,它是一个后进先出的容器,<strong>主要用于函数调用</strong>.当一个函数调用时会在栈中分配空间,当调用结束时,这个函数所占用的内存空间会一起释放,无需程序员关心.</li>
</ul>
<h3 id="进程与线程"><a href="#进程与线程" class="headerlink" title="进程与线程"></a>进程与线程</h3><hr>
<h4 id="进程"><a href="#进程" class="headerlink" title="进程"></a>进程</h4><hr>
<p><strong>进程是操作系统对一个正在运行的程序的一种抽象,它是程序的执行实体,是操作系统对资源进行调度的一个基本单位,同时也是线程的容器.</strong></p>
<p>进程跟虚拟内存一样,也是操作系统提供的一种假象,它让每个程序看上去都是在独占地使用<code>CPU</code>、内存和<code>I/O</code>设备.但其实在单核<code>CPU</code>上,<strong>同一时间只有一个进程在运行</strong>,而我们能够边听歌边上网边码代码的原因其实是操作系统在对进程进行切换,一个进程和另一个进程其实是交错执行的,只不过计算机的速度极快,我们无法感受到而已.</p>
<p>操作系统会保持跟踪进程运行所需的所有状态信息,这种状态,被称为上下文(<code>Context</code>),它包含了许多重要的信息,例如程序计数器和寄存器的当前值等.<strong>当操作系统需要对当前进程进行切换时(转移到另一个进程),会保存当前进程的上下文,然后恢复新进程的上下文</strong>,这时控制权会移交到新进程,新进程会从它上次停下来的地方开始执行,这个过程叫做上下文切换.</p>
<p><strong>操作系统的进程空间可以分为用户空间与内核空间</strong>,也就是用户态与内核态.它们的执行权限不同,一般的应用程序是在用户态中运行的,而当应用程序执行系统调用时就需要切换到内核态,由内核执行.</p>
<h4 id="线程"><a href="#线程" class="headerlink" title="线程"></a>线程</h4><hr>
<p><img src="https://upload.wikimedia.org/wikipedia/commons/a/a5/Multithreaded_process.svg" alt="多线程进程示意图"></p>
<p><strong>线程是操作系统所能调度的最小单位,它被包含在进程之中,且一个进程中的所有线程共享进程的资源,一个线程一般被指为进程中的一条单一顺序的控制流.</strong></p>
<p>线程都运行在进程的上下文中,虽然线程共享了进程的资源,但<strong>每条线程都拥有自己的独立空间</strong>,例如函数调用栈、寄存器、线程本地存储.</p>
<p>线程的实现主要有以下三种方式:</p>
<ul>
<li><p>使用内核线程实现: 内核线程就是由操作系统内核直接支持的线程,这种线程由内核来完成线程切换调度,内核通过调度器对线程进行调度,并将线程的任务映射到各个处理器上.<strong>应用程序一般不会直接使用内核线程,而是使用内核线程的一个接口: 轻量级进程,每个轻量级进程都由一个内核线程支持,所以它们的关系是1:1的</strong>.这种线程的实现方式的缺点也很明显,应用程序想要进行任何线程操作都需要进行系统调用,应用程序会在用户态和内核态之间来回切换,消耗的性能资源较多.</p>
</li>
<li><p>使用用户线程实现: <strong>这种方式将线程完全实现在用户空间中,相关的线程操作都在用户态中完成</strong>,这样可以避免切换到内核态,提高了性能.但正因为没有借助系统调用,操作系统只负责对进程分配资源,这些复杂的线程操作与线程调度都需要由用户线程自己处理实现,提高了程序的复杂性.这种实现方式下,一个进程对应多个用户线程,它们是1:N的关系.</p>
</li>
<li><p>混合实现: 这是一种将内核线程与用户线程一起使用的实现方式.在这种实现下,既存在用户线程,也存在轻量级进程.<strong>用户线程依旧是在用户空间中建立的(相关的线程操作也都是在用户空间中),但使用了轻量级进程来当作用户线程与内核线程之间的桥梁,让内核线程提供线程调度和对处理器的映射</strong>.这种实现方式下,用户线程与轻量级进程的数量比例是不定的,它们是N:M的关系.</p>
</li>
</ul>
<h3 id="文件"><a href="#文件" class="headerlink" title="文件"></a>文件</h3><hr>
<p>文件也是一个非常重要的抽象概念,<strong>它向应用程序提供了一个统一的视图</strong>,来看待系统中可能含有的所有各式各样的<code>I/O</code>设备.<strong>计算机文件系统通过文件与树形目录的抽象概念来屏蔽磁盘等物理设备所使用的数据块(<code>chunk</code>),让用户在使用文件的时候无需关心它实际的物理地址,用户也不需要管理磁盘上的空间分配,这些都由文件系统负责.</strong></p>
<p>所谓<strong>文件其实也就是一串字节序列</strong>,一个文件想要长期存储,就必须要存放于某种存储设备上,如本地磁盘、U盘.</p>
<h3 id="网络"><a href="#网络" class="headerlink" title="网络"></a>网络</h3><hr>
<p>如果用图论的方式来看待网络,<strong>其实网络就是一张无向图(需要双向通信),每台计算机都是图中的一个节点(指计算机网络),图的边就是计算机之间互相通信的连接.简单的说,计算机网络其实就是多台计算机进行通信的系统.</strong></p>
<p>网络其实也可以看作是一个<code>I/O</code>设备,当系统从内存中复制一串字节到网络适配器时,数据流经过网络传输到达另一台机器上(这其实就是输出操作),系统也可以读取从其他机器传输过来的数据,并把数据复制到内存中(输入).</p>
<p>互联网(<code>Internet</code>)是计算机网络中的一种(如果按区域划分还有局域网、广域网等),互联网是网络与网络之间组成的巨大的国际网络,这些网络之间以<code>TCP/IP</code>协议相连,连接了全世界上几十亿的设备.</p>
<p>我们日常生活中用浏览器上网浏览网页,其实使用的是万维网(<code>World Wide Web</code>),它是运行在互联网之上提供的一个服务,万维网是一个基于超文本链接组成的系统,并且通过<code>http</code>协议进行访问.</p>
<h4 id="OSI模型"><a href="#OSI模型" class="headerlink" title="OSI模型"></a>OSI模型</h4><hr>
<p><img src="http://wx2.sinaimg.cn/mw690/63503acbly1fjat0ftqf4j20fk0gz758.jpg"></p>
<p><code>OSI</code>模型全称为开放式系统互联通信参考模型(<code>Open System Interconnection Reference Model</code>),是由国际标准化组织提出的一个<strong>试图使各种计算机在世界范围内进行互联通信的标准框架.</strong></p>
<p>在<code>OSI</code>模型中,<strong>数据经过每一层都会添加该层的协议头(物理层除外)</strong>,当一个数据从一端发送到另一端时,需要经过层层封装.</p>
<ul>
<li><p>应用层: <strong>应用层直接和应用程序通信并提供常见的网络应用服务</strong>.常见的应用层协议有:HTTP,HTTPS,FTP,TELNET,SSH,SMTP,POP3等.</p>
</li>
<li><p>表示层: <strong>表示层为不同终端的上层用户提供数据和信息正确的语法表示变换方法</strong>.该层定义了数据格式及加解密.</p>
</li>
<li><p>会话层: <strong>会话层负责在数据传输中设置和维护网络中两台电脑之间的通信连接</strong>.但<strong>会话层不参与具体的传输</strong>,它只提供包括访问验证和会话管理在内的建立和维护应用之间通信的机制.</p>
</li>
<li><p>传输层: <strong>传输层将数据封装成数据包,提供端对端的数据通信服务</strong>.它还提供面向连接的数据流支持、可靠性、流量控制、多路复用等服务.最著名的传输层协议有<code>TCP</code>与<code>UDP</code>.</p>
</li>
<li><p>网络层: <strong>网络层提供路由和寻址的功能,使两终端系统能够互连且决定最佳路径,并具有一定的拥塞控制和流量控制的能力</strong>.网络层将网络表头(包含网络地址等数据)加到数据包中,网络层协议中最出名的就是<code>IP</code>协议.</p>
</li>
<li><p>数据链路层: <strong>数据链路层在两个网络实体之间提供数据链路连接的创建、维持和释放管理</strong>.它将数据划分为数据帧从一个节点传输到临近的另一个节点,这些节点是通过MAC(主机的物理地址)来进行标识的.</p>
</li>
<li><p>物理层: 物理层是<code>OSI</code>模型中最低的一层,物理层主要负责传输数据所需要的物理链路的创建、维持与拆除,并提供机械的、电子的、功能的和规范的特性.简单来说,<strong>物理层负责了物理设备之间的通信传输.</strong></p>
</li>
</ul>
<h4 id="TCP-IP"><a href="#TCP-IP" class="headerlink" title="TCP/IP"></a>TCP/IP</h4><hr>
<p><code>TCP</code>协议全称为传输控制协议(<code>Transmission Control Protocol</code>),由于它是基于<code>IP</code>协议之上的,所以也有人称作为<code>TCP/IP</code>协议.</p>
<p><code>TCP</code>协议是位于传输层的协议,它与同样位于传输层的<code>UDP</code>协议差别很大,它保证了数据包在传输时的可靠性(丢包重传),而<code>UDP</code>则只负责发送数据,不保证数据可靠送达.</p>
<p><code>TCP</code>为了保证不发生丢包,<strong>给每个包标记了一个序号,同时序号也保证了接收端在接收数据包时的顺序.然后接收端对已成功收到的包发回一个相应的确认(<code>ACK</code>)；如果发送端在合理的往返时延(<code>RTT</code>)内未收到确认,那么对应的数据包就被假设为已丢失将会被进行重传</strong>.<code>TCP</code>用一个校验和函数来检验数据是否有错误,在发送和接收时都要计算校验和.</p>
<p><code>TCP</code>协议在连接建立与终止时需要经过三次握手与四次挥手,这个机制主要都是为了提高可靠性.</p>
<p><img src="https://upload.wikimedia.org/wikipedia/commons/3/3f/Connection_TCP.png" alt="三次握手"></p>
<ol>
<li><p>客户端发送SYN（SEQ=x）报文给服务器端,进入SYN_SENT状态,等待服务器端确认.</p>
</li>
<li><p>服务器端收到SYN报文,回应一个SYN （SEQ=y）ACK(ACK=x+1）报文,进入SYN_RECV状态.</p>
</li>
<li><p>客户端收到服务器端的SYN报文,回应一个ACK(ACK=y+1）报文,进入Established状态.</p>
</li>
<li><p>服务器接收到客户端发送的ACK报文,三次握手完成,连接建立.</p>
</li>
</ol>
<p><img src="https://upload.wikimedia.org/wikipedia/commons/2/2d/Deconnection_TCP.png" alt="四次挥手"></p>
<ol>
<li><p>某一端首先调用close,称该端执行“主动关闭”（active close）.该端发送一个FIN报文,表示数据发送完毕(我们称它为<code>A</code>端).</p>
</li>
<li><p>另一端接收到这个FIN信号执行 “被动关闭”（passive close ),并回应一个ACK报文.(我们称它为<code>B</code>端)</p>
</li>
<li><p>一段时间后,<code>B</code>端没有数据发送的任务了,这时它将调用close关闭套接字,然后向<code>A</code>端发送一个FIN信号.</p>
</li>
<li><p><code>A</code>端接收到FIN信号,开始进行关闭连接,并对<code>B</code>端返回一个ACK.</p>
</li>
<li><p><code>B</code>端接收到来自<code>A</code>端的ACK信号,进行关闭连接,四次挥手完毕.</p>
</li>
</ol>
<p><code>TCP/IP</code>将<code>OSI</code>模型抽象成了四层,下图为以<code>HTTP</code>为例的一个数据发送过程.</p>
<p><img src="http://wx4.sinaimg.cn/mw690/63503acbly1fjat0gfjirj20q60s8gno.jpg"></p>
<h4 id="分组交换"><a href="#分组交换" class="headerlink" title="分组交换"></a>分组交换</h4><hr>
<p>数据包在网络中进行传输时使用了分组交换.分组交换也称为包交换,<strong>它将用户通信的数据划分成多个更小的等长数据段,在每个数据段的前面加上必要的控制信息作为数据段的首部,每个带有首部的数据段就构成了一个分组</strong>.首部指明了该分组发送的地址,当交换机收到分组之后,将根据首部中的地址信息将分组转发到目的地,这个过程就是分组交换.能够进行分组交换的通信网被称为分组交换网.</p>
<p><strong>分组交换的本质就是存储转发</strong>,它将所接受的分组暂时存储下来,在目的方向路由上排队,当它可以发送信息时,再将信息发送到相应的路由上,完成转发.其存储转发的过程就是分组交换的过程.</p>
<h3 id="数据的表示"><a href="#数据的表示" class="headerlink" title="数据的表示"></a>数据的表示</h3><hr>
<p>计算机编程语言拥有多种数据类型, 例如<code>int</code>、<code>char</code>、<code>double</code>等.<strong>但不管是什么类型的数据,在计算机中其实都只是一个字节序列(以8位二进制为一个字节)</strong>.每个机器中对字节序列的排序不大相同,有一些机器按照<strong>从最高有效字节到最低有效字节的顺序存储,这种规则被称为大端法</strong>;还有一些机器<strong>将最低有效字节排在最前面,这种规则被称为小端法</strong>.</p>
<p><strong>计算机使用补码来表示数值,一个数的最高有效位为符号位</strong>(以整数为例,整数占有4字节32位,最高位即最左位,剩下31位用于表示数字,所以整数的有效范围为<code>-2^31 ~ 2^31 - 1</code>),<strong>如果符号位为1,则代表这个值为负,如果符号位为0,则代表这个值为正.负数的补码即是它的反码(在保持符号位不变的前提下按位取反)+1,正数的补码不需要做其他操作,就是它本身的值.</strong></p>
<p><strong>当将一个较小类型的值强转为较大类型时(如<code>byte</code>强转为<code>int</code>),将会发生符号扩展,较小类型不包含的位会以符号位来进行填充</strong>(还是以<code>byte</code>为例,当它强转为<code>int</code>时,高24位会被填充为最高有效位中的数值,如果最高有效位为1,那么高24位都会为1,这时<code>byte</code>原来要表示的值将产生变化,要避免这种情况,可以使用一个低8位为1高24位为0的数,将它与强转后的结果进行<code>&amp;</code>操作,来保留低8位,并消除高24位中的1).</p>
<p>对一个数进行移位操作时,也需要按规则填充丢失的位数.<strong>移位操作分为算术移位与逻辑移位,算术移位会填充符号位,而逻辑移位全部填充0.</strong></p>
<ul>
<li><p>当进行左移操作时,右边空出的位用0补充,高位左移溢出则舍弃该高位.</p>
</li>
<li><p>当进行右移操作时,左边空出的位用符号位来补充(正数补0,负数补1),右边溢出则舍弃.如果使用逻辑移位(<code>Java</code>中为<code>&gt;&gt;&gt;</code>),左边空出的位会用0来补充.</p>
</li>
</ul>
<p>读到这里,可能有人会有疑问,为什么计算机非得使用补码?这主要因为,<strong>计算机中没有减法器只有加法器,而减去一个数其实就是加上一个负数,使用补码进行计算会很方便快速.</strong></p>
<p>我们假设一个指定<code>n</code>为长度的二进制序列,那么它将会有<code>2^n</code>个可能的值,加减法运算都存在上溢出与下溢出的情况,实际上都<strong>等价于<code>模(≡) 2^n</code>的加减法运算.</strong></p>
<p>把范围想象成一个时钟,假设现在时针指向数字3,若要得出6小时前时针指向的数字是几,有两种方法:</p>
<ol>
<li><p>将时针逆时针拨动6格.</p>
</li>
<li><p>将时针顺时针拨动12 - 6 = 6格.</p>
</li>
</ol>
<p>这里的12就是模,3小时-6小时 = 3小时 + (12 - 6)小时.</p>
<p>例如以下例子,模为<code>2^8 = 256</code></p>
<ul>
<li><p>一个8位无符号整数的值的范围是0到255.因此4+254将上溢出,结果为2: <code>(4 + 254) ≡ 258 ≡ 258 - 256 ≡ 2</code></p>
</li>
<li><p>一个8位有符号整数的值的范围是−128到127,则126+125将上溢出,结果为-5: <code>(126+125) ≡ 251 ≡ 251 - 256 ≡ -5</code></p>
</li>
</ul>
<h4 id="浮点数"><a href="#浮点数" class="headerlink" title="浮点数"></a>浮点数</h4><hr>
<p><strong>浮点数是一种对于实数的近似值数值表现法</strong>,由一个有效数字（即尾数）加上幂数来表示,通常是乘以某个基数的整数次指数得到.<strong>但浮点数计算通常伴随着因为无法精确表示而进行的近似或舍入.</strong></p>
<p><img src="https://upload.wikimedia.org/wikipedia/commons/4/4d/Float_mantissa_exponent.png" alt="浮点数的尾数与指数"></p>
<p>在计算机使用的浮点数被电气电子工程师协会（IEEE）规范化为IEEE-754,任意一个二进制浮点数V都可以表示成下列形式: </p>
<ul>
<li><p><code>V = (-1)^s * M * 2^E</code></p>
</li>
<li><p>${(-1)}^s$表示符号位,当s=0,V为正数;s=1,V为负数.</p>
</li>
<li><p>M 表示有效数字,$1≤M&lt;2$.</p>
</li>
<li><p>$2^E$表示指数位.</p>
</li>
</ul>
<p>这种表示方式有点类似于科学计数法,在计算机中,通常使用2为基数的幂数来表示.IEEE-754同时还规定了单精度(<code>float</code>)与双精度(<code>double</code>)的区别:</p>
<ul>
<li><p>32位的单精度浮点数,最高1位是符号位s,接着的8位是指数E,剩下的23位是有效数字M.</p>
</li>
<li><p>64位的双精度浮点数,最高1位是符号位s,接着的11位是指数E,剩下的52位为有效数字M.</p>
</li>
</ul>
<h3 id="函数调用"><a href="#函数调用" class="headerlink" title="函数调用"></a>函数调用</h3><hr>
<p><strong>当调用一个函数时,系统会在栈上分配一个空间,存放了函数中的局部变量、函数参数、返回地址等,这样的一个结构被称为栈帧.</strong></p>
<p>函数中的数据的存活状态是后进先出的,而栈正好是满足这一特性的数据结构,这也是为什么计算机使用栈来当作函数调用的存储结构.</p>
<figure class="highlight cpp"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br></pre></td><td class="code"><pre><span class="line"><span class="function"><span class="keyword">int</span> <span class="title">main</span><span class="params">()</span> </span>&#123;</span><br><span class="line">  sayHello();</span><br><span class="line">  <span class="keyword">return</span> <span class="number">0</span>;</span><br><span class="line">&#125;</span><br><span class="line"></span><br><span class="line"><span class="function"><span class="keyword">void</span> <span class="title">sayHello</span><span class="params">()</span> </span>&#123;</span><br><span class="line">  hello_world();</span><br><span class="line">&#125;</span><br><span class="line"></span><br><span class="line"><span class="function"><span class="keyword">void</span> <span class="title">hello_world</span><span class="params">()</span> </span>&#123;</span><br><span class="line">  print(<span class="string">&quot;Hello,World&quot;</span>);</span><br><span class="line">&#125;</span><br><span 
class="line"></span><br><span class="line"></span><br><span class="line"> main()  sayHello()  hello_world()  print()</span><br><span class="line">   -                                main()</span><br><span class="line">   |</span><br><span class="line">   +&gt;     -                            sayHello()</span><br><span class="line">   .      |</span><br><span class="line">   .      +&gt;   -                              hello_world()</span><br><span class="line">   .      .    |</span><br><span class="line">   .      .    +&gt;   -                                  print()</span><br><span class="line">   .      .    .    |</span><br><span class="line">   .      .    +   &lt;-                       <span class="keyword">return</span> from print()</span><br><span class="line">   .      .    |</span><br><span class="line">   .      +   &lt;-                        <span class="keyword">return</span> from hello_world()</span><br><span class="line">   .      |</span><br><span class="line">   +     &lt;-                        <span class="keyword">return</span> from sayHello()</span><br><span class="line">   |</span><br><span class="line">   -                             <span class="keyword">return</span> from main()</span><br></pre></td></tr></table></figure>
<p><strong>在<code>x86-64</code>架构中,栈是向低地址方向生长的,寄存器<code>%rsp</code>指向栈顶</strong>,当一个函数被调用时,将会执行<code>pushq</code>指令,栈帧入栈,栈指针减小(向下生长),当函数返回后,将会执行<code>popq</code>指令,栈帧出栈,释放空间,栈指针增加.如果不断有函数进行调用,栈就会不断向下生长,最终会产生<code>Stack Overflow</code>.</p>
<h3 id="计算机编程语言"><a href="#计算机编程语言" class="headerlink" title="计算机编程语言"></a>计算机编程语言</h3><hr>
<p><strong>计算机编程语言是用来定义计算机程序的语言,它以一种标准化的语法规则来向计算机发出指令</strong>.最早的编程语言是在计算机发明之前产生的,当时是用来控制提花织布机及自动演奏钢琴的动作.如今已经有上千种不同的编程语言,不管是哪种语言,尽管它们的特性各有不同,但写程序的核心都是条件判断、循环、分支(这些也是机器指令的核心).</p>
<p>编程语言依赖于编译器或解释器(所以也分为编译型语言与解释型语言),如果没有对应的编译器/解释器来对语法与语义进行分析并生成对应的机器语言,那么我们所写的代码其实都只是普通的文本字符(编译器/解释器也会对源代码进行一系列优化提高性能).</p>
<p>编译型语言通过编译器直接将源代码翻译成机器语言并生成一个可执行文件(机器语言是不兼容的,如果要到另一台机器上运行,就需要对源代码重新编译);解释型语言通过解释器动态地翻译源代码并直接执行(性能上会比编译型语言直接运行可执行文件要差);虽然大多数的语言既可被编译又可被解译,但大多数仅在一种情况下能够良好运行.</p>
<p><code>Java</code>的编译机制比较特殊,它将<code>Java</code>源代码编译成<code>JVM</code>字节码(通过虚拟机来达到一次编译在所有平台可用),然后<code>JVM</code>对字节码进行解释执行,但对于较热的代码块(频繁调用的函数等),<code>JVM</code>会通过<code>JIT</code>即时编译技术将这些频繁使用的代码块动态地编译成机器语言,提高程序的性能.</p>

      
    </div>

    
    
    

    <footer class="post-footer">
        <div class="post-eof"></div>
      
    </footer>
  </article>
</div>




    


<div class="post-block">
  
  

  <article itemscope itemtype="http://schema.org/Article" class="post-content" lang="zh">
    <link itemprop="mainEntityOfPage" href="https://suyuhuan.gitee.io/yuwanzi.io/2017/08/20/2017-08-20-Encode/">

    <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
      <meta itemprop="image" content="/yuwanzi.io/images/avatar.gif">
      <meta itemprop="name" content="玉丸子">
      <meta itemprop="description" content="这里是玉丸子的个人博客,与你一起发现更大的世界。">
    </span>

    <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
      <meta itemprop="name" content="玉丸子 | Blog">
    </span>
      <header class="post-header">
        <h2 class="post-title" itemprop="name headline">
          <a href="/yuwanzi.io/2017/08/20/2017-08-20-Encode/" class="post-title-link" itemprop="url">编码的那点事儿</a>
        </h2>

        <div class="post-meta-container">
          <div class="post-meta">
    <span class="post-meta-item">
      <span class="post-meta-item-icon">
        <i class="far fa-calendar"></i>
      </span>
      <span class="post-meta-item-text">Veröffentlicht am</span>

      <time title="Erstellt: 2017-08-20 12:00:00" itemprop="dateCreated datePublished" datetime="2017-08-20T12:00:00+08:00">2017-08-20</time>
    </span>
      <span class="post-meta-item">
        <span class="post-meta-item-icon">
          <i class="far fa-calendar-check"></i>
        </span>
        <span class="post-meta-item-text">Bearbeitet am</span>
        <time title="Geändert am: 2020-11-07 08:58:17" itemprop="dateModified" datetime="2020-11-07T08:58:17+08:00">2020-11-07</time>
      </span>
    <span class="post-meta-item">
      <span class="post-meta-item-icon">
        <i class="far fa-folder"></i>
      </span>
      <span class="post-meta-item-text">in</span>
        <span itemprop="about" itemscope itemtype="http://schema.org/Thing">
          <a href="/yuwanzi.io/categories/%E5%90%8E%E7%AB%AF/" itemprop="url" rel="index"><span itemprop="name">后端</span></a>
        </span>
          . 
        <span itemprop="about" itemscope itemtype="http://schema.org/Thing">
          <a href="/yuwanzi.io/categories/%E5%90%8E%E7%AB%AF/Java/" itemprop="url" rel="index"><span itemprop="name">Java</span></a>
        </span>
    </span>

  
</div>

        </div>
      </header>

    
    
    
    <div class="post-body" itemprop="articleBody">
          <h3 id="什么是编码"><a href="#什么是编码" class="headerlink" title="什么是编码?"></a>什么是编码?</h3><hr>
<p>对于普通人来说,编码总是与一些秘密的东西相关联(加密与解密);对于程序员们来说,编码大多数是指一种用来在机器与人之间传递信息的方式.</p>
<p>但从广义上来讲,<strong>编码是从一种信息格式转换为另一种信息格式的过程,解码则是编码的逆向过程</strong>.接下来举几个使用到编码的例子: </p>
<ul>
<li><p>当我们要把想表达的意思通过一种语言表达出来,其实就是在脑海中对信息进行了一次编码,而对方如果也懂得这门语言,那么就可以用这门语言的解码方法(语法规则)来获得信息(日常的说话交流其实就是在编码与解码).</p>
</li>
<li><p>程序员写程序时,其实就是在将自己的想法通过计算机语言进行编码,而编译器则通过生成抽象语法树,词义分析等操作进行解码,最终交给计算机执行程序(编译器产生的解码结果并不是最终结果,一般为汇编语言,但汇编语言只是CPU指令集的助记符,还需要再进行解码).</p>
</li>
</ul>
<ul>
<li>计算机只有两种状态(0和1),要想存储和传输多媒体信息,就需要用到编码和解码.</li>
</ul>
<ul>
<li>对数据进行压缩,其本质就是以减少自身占用的空间为前提进行重新编码.</li>
</ul>
<p>了解了编码的含义,我们接下来重点探究<code>Java</code>中的字符编码.</p>
<blockquote>
<p>本文作者为: <a target="_blank" rel="noopener" href="https://github.com/SylvanasSun/">SylvanasSun</a>.转载请务必将下面这段话置于文章开头处(保留超链接).<br>本文首发自<a target="_blank" rel="noopener" href="https://sylvanassun.github.io/">SylvanasSun Blog</a>,原文链接: <a target="_blank" rel="noopener" href="https://sylvanassun.github.io/2017/08/20/2017-08-20-Encode/">https://sylvanassun.github.io/2017/08/20/2017-08-20-Encode/</a></p>
</blockquote>
<h3 id="常见的字符集"><a href="#常见的字符集" class="headerlink" title="常见的字符集"></a>常见的字符集</h3><hr>
<p><strong>字符集就是字符与二进制的映射表</strong>,每一个字符集都有自己的编码规则,每个字符所占用的字节也不同(支持的字符越多每个字符占用的字节也就越多).</p>
<ul>
<li><p>ASCII : 美国信息交换标准码(American Standard Code for Information Interchange).学过计算机的都知道大名鼎鼎的<code>ASCII</code>码,它是基于拉丁字母的字符集,总共记有128个字符,主要目的是显示英语.其中每个字符占用一个字节(只用到了低7位).</p>
<p> <img src="http://wx4.sinaimg.cn/large/63503acbly1fith5ayrgdj20n50ct0tj.jpg"></p>
</li>
</ul>
<ul>
<li><p>ISO-8859-1 : 它是由国际标准化组织(International Standardization Organization)在<code>ASCII</code>基础上制定的8位字符集(仍然是单字节编码).它在<code>ASCII</code>空置的<code>0xA0-0xFF</code>范围内加入了96个字母与符号,支持了欧洲部分国家的语言.</p>
<p> <img src="http://wx1.sinaimg.cn/large/63503acbly1fith5be3u3j20o80bcgmk.jpg"></p>
</li>
</ul>
<ul>
<li><p>GBK : 如果我们想要让电脑上显示汉字就必须要有支持汉字的字符集,GBK就是这样一个支持汉字的字符集,全称为&lt;&lt;汉字内码扩展规范&gt;&gt;,它的编码方式分为单字节与双字节: <code>00–7F</code>范围内是单字节编码,与<code>ASCII</code>保持一致,之后的双字节中,前一字节是双字节的第一位(范围在<code>81–FE</code>,不包含<code>80</code>和<code>FF</code>),第二字节的一部分在<code>40–7E</code>,其他部分在<code>80–FE</code>.(这里不再介绍<code>GB2312</code>与<code>GB18030</code>,它们都是互相兼容的.)</p>
<p> <img src="http://wx1.sinaimg.cn/large/63503acbly1fith5bqcprj20r00bcjsi.jpg"></p>
</li>
<li><p>UTF-16 : <code>UTF-16</code>是<code>Unicode(统一码,一种以支持世界上多国语言为目的的通用字符集)</code>的一种实现方式,它把<code>Unicode</code>的抽象码位<strong>映射为<code>2</code>个或<code>4</code>个字节来表示</strong>,<strong><code>UTF-16</code>是变长编码(<code>UTF-32是真正的定长编码</code>)</strong>,但在最开始<code>UTF-16</code>是用来配合<code>UCS-2(UTF-16的子集,它是定长编码,用2个字节表示所有Unicode字符)</code>使用的,主要原因还是因为当时<code>Unicode</code>只有不到65536个字符,2个字节就足以应对一切了.后来,<code>Unicode</code>支持的字符不断膨胀,2个字节已经不够用了,导致一些只支持<code>UCS-2</code>当做内码的产品很尴尬(<code>Java</code>就是其中之一).</p>
<p> <img src="http://wx4.sinaimg.cn/large/63503acbly1fith5c3ib8j20qg0ck3zq.jpg"></p>
</li>
<li><p>UTF-8 : <strong><code>UTF-8</code>也是基于<code>Unicode</code>的变长编码表</strong>,它使用<code>1~6</code>个字节来为每个字符进行编码(<code>RFC 3629</code>对<code>UTF-8</code>进行了重新规范,只能使用原来<code>Unicode</code>定义的区域,<code>U+0000~U+10FFFF</code>,也就是说最多只有4个字节),<code>UTF-8</code>完全兼容<code>ASCII</code>,它的编码规则如下:</p>
<ul>
<li><p>在<code>U+0000~U+007F</code>范围内,只需要一个字节(也就是<code>ASCII</code>字符集中的字符).</p>
</li>
<li><p>在<code>U+0080~U+07FF</code>范围内,需要两个字节(希腊文、阿拉伯文、希伯来文等).</p>
</li>
<li><p>在<code>U+0800~U+FFFF</code>范围内,需要三个字节(亚洲汉字等).</p>
</li>
<li><p>其他的字符使用四个字节.</p>
</li>
</ul>
</li>
</ul>
<p><img src="http://wx2.sinaimg.cn/large/63503acbly1fith5cmmpbj20w008ot9i.jpg"></p>
<h3 id="Java中字符的编解码"><a href="#Java中字符的编解码" class="headerlink" title="Java中字符的编解码"></a>Java中字符的编解码</h3><hr>
<p><code>Java</code>提供了<code>Charset</code>类来完成对字符的编码与解码,主要使用以下函数: </p>
<ul>
<li><code>public static Charset forName(String charsetName)</code> : 这是一个静态工厂函数,它根据传入的字符集名称来返回对应字符集的<code>Charset</code>类.</li>
</ul>
<ul>
<li><code>public final ByteBuffer encode(CharBuffer cb) / public final ByteBuffer encode(String str)</code> : 编码函数,它将传入的字符串或者字符序列进行编码,返回的<code>ByteBuffer</code>是一个字节缓冲区.</li>
</ul>
<ul>
<li><code>public final CharBuffer decode(ByteBuffer bb)</code> : 解码函数,将传入的字节序列解码为字符序列.</li>
</ul>
<h4 id="示例代码"><a href="#示例代码" class="headerlink" title="示例代码"></a>示例代码</h4><hr>
<figure class="highlight java"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">private</span> <span class="keyword">static</span> <span class="keyword">final</span> String text = <span class="string">&quot;Hello,编码!&quot;</span>;</span><br><span class="line"></span><br><span class="line"><span class="keyword">private</span> <span class="keyword">static</span> <span class="keyword">final</span> Charset ASCII = Charset.forName(<span class="string">&quot;ASCII&quot;</span>);</span><br><span class="line"></span><br><span class="line"><span class="keyword">private</span> <span class="keyword">static</span> <span class="keyword">final</span> Charset ISO_8859_1 = Charset.forName(<span 
class="string">&quot;ISO-8859-1&quot;</span>);</span><br><span class="line"></span><br><span class="line"><span class="keyword">private</span> <span class="keyword">static</span> <span class="keyword">final</span> Charset GBK = Charset.forName(<span class="string">&quot;GBK&quot;</span>);</span><br><span class="line"></span><br><span class="line"><span class="keyword">private</span> <span class="keyword">static</span> <span class="keyword">final</span> Charset UTF_16 = Charset.forName(<span class="string">&quot;UTF-16&quot;</span>);</span><br><span class="line"></span><br><span class="line"><span class="keyword">private</span> <span class="keyword">static</span> <span class="keyword">final</span> Charset UTF_8 = Charset.forName(<span class="string">&quot;UTF-8&quot;</span>);</span><br><span class="line"></span><br><span class="line"><span class="function"><span class="keyword">private</span> <span class="keyword">static</span> <span class="keyword">void</span> <span class="title">encodeAndPrint</span><span class="params">(Charset charset)</span> </span>&#123;</span><br><span class="line">	System.out.println(charset.name() + <span class="string">&quot;: &quot;</span>);</span><br><span class="line">	printHex(text.toCharArray(), charset);</span><br><span class="line">	System.out.println(<span class="string">&quot;----------------------------------&quot;</span>);</span><br><span class="line">&#125;</span><br><span class="line"></span><br><span class="line"><span class="function"><span class="keyword">private</span> <span class="keyword">static</span> <span class="keyword">void</span> <span class="title">printHex</span><span class="params">(<span class="keyword">char</span>[] chars, Charset charset)</span> </span>&#123;</span><br><span class="line">	System.out.println(<span class="string">&quot;ForEach: &quot;</span>);</span><br><span class="line">	ByteBuffer byteBuffer;</span><br><span class="line">	<span class="keyword">byte</span>[] bytes;</span><br><span 
class="line">	<span class="keyword">if</span> (chars != <span class="keyword">null</span>) &#123;</span><br><span class="line">		<span class="keyword">for</span> (<span class="keyword">char</span> c : chars) &#123;</span><br><span class="line">			System.out.print(<span class="string">&quot;char: &quot;</span> + Integer.toHexString(c) + <span class="string">&quot; &quot;</span>);</span><br><span class="line">			<span class="comment">// 打印出字符编码后对应的字节</span></span><br><span class="line">			byteBuffer = charset.encode(String.valueOf(c));</span><br><span class="line">			bytes = byteBuffer.array();</span><br><span class="line">			System.out.print(<span class="string">&quot;byte: &quot;</span>);</span><br><span class="line">			<span class="keyword">if</span> (bytes != <span class="keyword">null</span>) &#123;</span><br><span class="line">				<span class="keyword">for</span> (<span class="keyword">byte</span> b : bytes)</span><br><span class="line">					System.out.print(Integer.toHexString(b &amp; <span class="number">0xFF</span>) + <span class="string">&quot; &quot;</span>);</span><br><span class="line">			&#125;</span><br><span class="line">			System.out.println();</span><br><span class="line">		&#125;</span><br><span class="line">	&#125;</span><br><span class="line">	System.out.println();</span><br><span class="line">&#125;</span><br></pre></td></tr></table></figure>
<p>有的读者可能会对以上代码中的<code>b &amp; 0xFF</code>产生疑惑,这是为了解决符号扩展问题.在<code>Java</code>中,<strong>如果一个窄类型强转为一个宽类型时,会对多出来的空位进行符号扩展(如果符号位为1,就补1,为0则补0)</strong>.只有<code>char</code>类型除外,<code>char</code>是没有符号位的,所以它永远都是补0.</p>
<p>代码中调用了函数<code>Integer.toHexString()</code>,变量<code>b</code>在运算之前就已经被强转为了<code>int</code>类型,为了让数值不受到破坏,我们让<code>b</code>对<code>0xFF</code>进行了与运算,<code>0xFF</code>是一个低八位都为1的值(其他位都为0),而<code>byte</code>的有效范围只在低八位,所以结果为前24位(包括符号位)都变为了0,低八位保留了原有的值.</p>
<p>如果不做这项操作,那么<code>b</code>又恰好是个负数的话,那这个强转后的<code>int</code>的前24位都会变为1,这个结果显然已经破坏了原有的值.</p>
<h3 id="IO中的字符编码"><a href="#IO中的字符编码" class="headerlink" title="IO中的字符编码"></a>IO中的字符编码</h3><hr>
<p><code>Reader</code>与<code>Writer</code>是<code>Java</code>中负责字符输入与输出的抽象基类,它们的子类实现了在各种场景中的字符输入输出功能.</p>
<p>在使用<code>Reader</code>与<code>Writer</code>进行<code>IO</code>操作时,需要指定字符集,如果不显式指定的话会默认使用当前环境的字符集,但我还是推荐显式指定<strong>一致的字符集</strong>,这样才不会出现乱码问题(<code>Reader</code>与<code>Writer</code>指定的字符集不一致或更改了环境导致字符集不一致等).</p>
<figure class="highlight java"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br><span class="line">43</span><br><span class="line">44</span><br></pre></td><td class="code"><pre><span class="line"><span class="function"><span class="keyword">public</span> <span class="keyword">static</span> <span class="keyword">void</span> <span class="title">writeChar</span><span class="params">(String content, String filename, String charset)</span> </span>&#123;</span><br><span class="line">	OutputStreamWriter writer = <span class="keyword">null</span>;</span><br><span class="line"></span><br><span class="line">	<span class="keyword">try</span> &#123;</span><br><span class="line">		
FileOutputStream outputStream = <span class="keyword">new</span> FileOutputStream(filename);</span><br><span class="line">		writer = <span class="keyword">new</span> OutputStreamWriter(outputStream, charset);</span><br><span class="line">		writer.write(content);</span><br><span class="line">	&#125; <span class="keyword">catch</span> (IOException e) &#123;</span><br><span class="line">		e.printStackTrace();</span><br><span class="line">	&#125; <span class="keyword">finally</span> &#123;</span><br><span class="line">		<span class="keyword">try</span> &#123;</span><br><span class="line">			<span class="keyword">if</span> (writer != <span class="keyword">null</span>)</span><br><span class="line">				writer.close();</span><br><span class="line">		&#125; <span class="keyword">catch</span> (IOException e) &#123;</span><br><span class="line">			e.printStackTrace();</span><br><span class="line">		&#125;</span><br><span class="line">	&#125;</span><br><span class="line">&#125;</span><br><span class="line"></span><br><span class="line"><span class="function"><span class="keyword">public</span> <span class="keyword">static</span> String <span class="title">readChar</span><span class="params">(String filename, String charset)</span> </span>&#123;</span><br><span class="line">	InputStreamReader reader = <span class="keyword">null</span>;</span><br><span class="line">	StringBuilder sb = <span class="keyword">null</span>;</span><br><span class="line"></span><br><span class="line">	<span class="keyword">try</span> &#123;</span><br><span class="line">		FileInputStream inputStream = <span class="keyword">new</span> FileInputStream(filename);</span><br><span class="line">		reader = <span class="keyword">new</span> InputStreamReader(inputStream, charset);</span><br><span class="line">		<span class="keyword">char</span>[] buf = <span class="keyword">new</span> <span class="keyword">char</span>[<span class="number">64</span>];</span><br><span class="line">		<span 
class="keyword">int</span> count = <span class="number">0</span>;</span><br><span class="line">		sb = <span class="keyword">new</span> StringBuilder();</span><br><span class="line">		<span class="keyword">while</span> ((count = reader.read(buf)) != -<span class="number">1</span>)</span><br><span class="line">			sb.append(buf, <span class="number">0</span>, count);</span><br><span class="line">	&#125; <span class="keyword">catch</span> (IOException e) &#123;</span><br><span class="line">		e.printStackTrace();</span><br><span class="line">	&#125; <span class="keyword">finally</span> &#123;</span><br><span class="line">		<span class="keyword">try</span> &#123;</span><br><span class="line">			<span class="keyword">if</span> (reader != <span class="keyword">null</span>)</span><br><span class="line">				reader.close();</span><br><span class="line">		&#125; <span class="keyword">catch</span> (IOException e) &#123;</span><br><span class="line">			e.printStackTrace();</span><br><span class="line">		&#125;</span><br><span class="line">	&#125;</span><br><span class="line"></span><br><span class="line">	<span class="keyword">return</span> sb.toString();</span><br><span class="line">&#125;</span><br></pre></td></tr></table></figure>

<h3 id="Web中的字符编码"><a href="#Web中的字符编码" class="headerlink" title="Web中的字符编码"></a>Web中的字符编码</h3><hr>
<p>在<code>Web</code>开发中,乱码也是经常存在的一个问题,主要体现在请求的参数和返回的响应结果,最头疼的是不同的浏览器的默认编码甚至还不一致.</p>
<p><code>Java</code>以<code>Http</code>的请求与响应抽象出了<code>Request</code>和<code>Response</code>两个对象,只要保持<strong>请求与响应的编码一致</strong>就能避免乱码问题.</p>
<p><code>Request</code>提供了<code>setCharacterEncoding(String encode)</code>函数来改变请求体的编码,一般通过写一个过滤器来统一对所有请求设置编码.</p>
<figure class="highlight java"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line">request.setCharacterEncoding(<span class="string">&quot;UTF-8&quot;</span>);</span><br></pre></td></tr></table></figure>
<p><code>Response</code>提供了<code>setCharacterEncoding(String encode)</code>与<code>setHeader(String name,String value)</code>两个函数,它们都可以设置响应的编码.</p>
<figure class="highlight java"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br></pre></td><td class="code"><pre><span class="line">response.setCharacterEncoding(<span class="string">&quot;UTF-8&quot;</span>);</span><br><span class="line"><span class="comment">// 设置响应头的编码信息,同时也告知了浏览器该如何解码</span></span><br><span class="line">response.setHeader(<span class="string">&quot;Content-Type&quot;</span>,<span class="string">&quot;text/html;charset=UTF-8&quot;</span>); </span><br></pre></td></tr></table></figure>
<p>还有一种更简便的方式,直接使用<code>Spring</code>提供的<code>CharacterEncodingFilter</code>,该过滤器就是用来统一编码的.</p>
<figure class="highlight xml"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br></pre></td><td class="code"><pre><span class="line"><span class="tag">&lt;<span class="name">filter</span>&gt;</span></span><br><span class="line">    <span class="tag">&lt;<span class="name">filter-name</span>&gt;</span>charsetFilter<span class="tag">&lt;/<span class="name">filter-name</span>&gt;</span></span><br><span class="line">    <span class="tag">&lt;<span class="name">filter-class</span>&gt;</span>org.springframework.web.filter.CharacterEncodingFilter<span class="tag">&lt;/<span class="name">filter-class</span>&gt;</span></span><br><span class="line">    <span class="tag">&lt;<span class="name">init-param</span>&gt;</span></span><br><span class="line">        <span class="tag">&lt;<span class="name">param-name</span>&gt;</span>encoding<span class="tag">&lt;/<span class="name">param-name</span>&gt;</span></span><br><span class="line">        <span class="tag">&lt;<span class="name">param-value</span>&gt;</span>UTF-8<span class="tag">&lt;/<span class="name">param-value</span>&gt;</span></span><br><span class="line">    <span class="tag">&lt;/<span class="name">init-param</span>&gt;</span></span><br><span class="line">    <span class="tag">&lt;<span class="name">init-param</span>&gt;</span></span><br><span class="line">        <span class="tag">&lt;<span class="name">param-name</span>&gt;</span>forceEncoding<span class="tag">&lt;/<span class="name">param-name</span>&gt;</span></span><br><span 
class="line">        <span class="tag">&lt;<span class="name">param-value</span>&gt;</span>true<span class="tag">&lt;/<span class="name">param-value</span>&gt;</span></span><br><span class="line">    <span class="tag">&lt;/<span class="name">init-param</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;/<span class="name">filter</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;<span class="name">filter-mapping</span>&gt;</span></span><br><span class="line">   <span class="tag">&lt;<span class="name">filter-name</span>&gt;</span>charsetFilter<span class="tag">&lt;/<span class="name">filter-name</span>&gt;</span></span><br><span class="line">   <span class="tag">&lt;<span class="name">url-pattern</span>&gt;</span>*<span class="tag">&lt;/<span class="name">url-pattern</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;/<span class="name">filter-mapping</span>&gt;</span></span><br></pre></td></tr></table></figure>
<p><code>CharacterEncodingFilter</code>的实现如下: </p>
<figure class="highlight java"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">public</span> <span class="class"><span class="keyword">class</span> <span class="title">CharacterEncodingFilter</span> <span class="keyword">extends</span> <span class="title">OncePerRequestFilter</span> </span>&#123;</span><br><span class="line">    <span class="keyword">private</span> String encoding;</span><br><span class="line">    <span class="keyword">private</span> <span class="keyword">boolean</span> forceEncoding = <span class="keyword">false</span>;</span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">public</span> <span class="title">CharacterEncodingFilter</span><span class="params">()</span> </span>&#123;</span><br><span class="line">    &#125;</span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">public</span> <span class="keyword">void</span> <span class="title">setEncoding</span><span class="params">(String encoding)</span> </span>&#123;</span><br><span class="line">        <span 
class="keyword">this</span>.encoding = encoding;</span><br><span class="line">    &#125;</span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">public</span> <span class="keyword">void</span> <span class="title">setForceEncoding</span><span class="params">(<span class="keyword">boolean</span> forceEncoding)</span> </span>&#123;</span><br><span class="line">        <span class="keyword">this</span>.forceEncoding = forceEncoding;</span><br><span class="line">    &#125;</span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">protected</span> <span class="keyword">void</span> <span class="title">doFilterInternal</span><span class="params">(HttpServletRequest request, HttpServletResponse response, FilterChain filterChain)</span> <span class="keyword">throws</span> ServletException, IOException </span>&#123;</span><br><span class="line">        <span class="keyword">if</span>(<span class="keyword">this</span>.encoding != <span class="keyword">null</span> &amp;&amp; (<span class="keyword">this</span>.forceEncoding || request.getCharacterEncoding() == <span class="keyword">null</span>)) &#123;</span><br><span class="line">            request.setCharacterEncoding(<span class="keyword">this</span>.encoding);</span><br><span class="line">            <span class="keyword">if</span>(<span class="keyword">this</span>.forceEncoding) &#123;</span><br><span class="line">                response.setCharacterEncoding(<span class="keyword">this</span>.encoding);</span><br><span class="line">            &#125;</span><br><span class="line">        &#125;</span><br><span class="line"></span><br><span class="line">        filterChain.doFilter(request, response);</span><br><span class="line">    &#125;</span><br><span class="line">&#125;</span><br></pre></td></tr></table></figure>

<h3 id="为什么Char在Java中占用两个字节"><a href="#为什么Char在Java中占用两个字节" class="headerlink" title="为什么Char在Java中占用两个字节?"></a>为什么Char在Java中占用两个字节?</h3><hr>
<p>众所周知,在<code>Java</code>中一个<code>char</code>类型占用两个字节,那么这是为什么呢?这是因为<code>Java</code>使用了<code>UTF-16</code>当作内码.</p>
<p><strong>内码(<code>Internal Encoding</code>)就是程序内部所使用的编码</strong>,主要在于编程语言实现其<code>char</code>和<code>String</code>类型在内存中使用的内部编码.与之相对的就是<strong>外码(<code>External Encoding</code>),它是程序与外部交互时使用的字符编码</strong>.</p>
<p>值得一提的是,当初<code>UTF-16</code>是配合<code>UCS-2</code>使用的,后来<code>Unicode</code>支持的字符不断增多,<code>UTF-16</code>也不再只当作一个定长的2字节编码使用了,也就是说,<strong><code>Java</code>中的一个<code>char</code>其实并不一定能代表一个完整的<code>UTF-16</code>字符.</strong></p>
<p><code>String.getBytes()</code>可以将该String的内码转换为指定的外码并返回这个编完码的字节数组(无参数版使用当前平台的默认编码).</p>
<figure class="highlight java"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br></pre></td><td class="code"><pre><span class="line"><span class="function"><span class="keyword">public</span> <span class="keyword">static</span> <span class="keyword">void</span> <span class="title">main</span><span class="params">(String[] args)</span> <span class="keyword">throws</span> UnsupportedEncodingException </span>&#123;</span><br><span class="line">	String text = <span class="string">&quot;码&quot;</span>;</span><br><span class="line">	<span class="keyword">byte</span>[] bytes = text.getBytes(<span class="string">&quot;UTF-8&quot;</span>); </span><br><span class="line">	System.out.println(bytes.length); <span class="comment">// 输出3</span></span><br><span class="line">&#125;</span><br></pre></td></tr></table></figure>
<p><code>Java</code>还规定<code>char</code>与<code>String</code>类型的序列化是使用<code>UTF-8</code>当作外码的,<code>Java</code>中的<code>Class</code>文件中的字符串常量与符号名也都规定使用<code>UTF-8</code>.这种设计是为了平衡运行时的时间效率与外部存储的空间效率所做的取舍.</p>
<p>在<code>SUN JDK6</code>中,有一个JVM参数<code>-XX:+UseCompressedStrings</code>.该参数可以让<code>String</code>内部存储字符内容可能用<code>byte[]</code>也可能用<code>char[]</code>: 当整个字符串所有字符处于<code>ASCII</code>字符集范围内时,就使用<code>byte[]</code>(使用了<code>ASCII</code>编码)来存储,如果有任一字符超过了<code>ASCII</code>的范围,就退回到使用<code>char[]</code>(<code>UTF-16</code>编码)来存储.但是这个功能实现得并不理想,所以没有包含在<code>Open JDK6</code>/<code>Open JDK7</code>/<code>Oracle JDK7</code>等后续版本中.</p>
<p><code>JavaScript</code>也使用了<code>UTF-16</code>作为内码,其实现也广泛应用了<code>CompressedString</code>的思想,主流的<code>JavaScript</code>引擎中都会尽可能使用<code>ASCII</code>内码的字符串,不过这些细节都是对外隐藏的.</p>
<h3 id="参考文献"><a href="#参考文献" class="headerlink" title="参考文献"></a>参考文献</h3><hr>
<ul>
<li><p><a target="_blank" rel="noopener" href="https://en.wikipedia.org/wiki/ASCII">ASCII - Wikipedia</a></p>
</li>
<li><p><a target="_blank" rel="noopener" href="https://en.wikipedia.org/wiki/ISO/IEC_8859-1">ISO/IEC 8859-1 - Wikipedia</a></p>
</li>
</ul>
<ul>
<li><a target="_blank" rel="noopener" href="https://en.wikipedia.org/wiki/GBK">GBK - Wikipedia</a></li>
</ul>
<ul>
<li><a target="_blank" rel="noopener" href="https://en.wikipedia.org/wiki/UTF-16">UTF-16 - Wikipedia</a></li>
</ul>
<ul>
<li><a target="_blank" rel="noopener" href="https://en.wikipedia.org/wiki/UTF-8">UTF-8 - Wikipedia</a></li>
</ul>
<ul>
<li><a target="_blank" rel="noopener" href="https://www.zhihu.com/question/27562173/answer/37188642">Java 语言中一个字符占几个字节？ - RednaxelaFX的回答</a></li>
</ul>

      
    </div>

    
    
    

    <footer class="post-footer">
        <div class="post-eof"></div>
      
    </footer>
  </article>
</div>




    


<div class="post-block">
  
  

  <article itemscope itemtype="http://schema.org/Article" class="post-content" lang="zh">
    <link itemprop="mainEntityOfPage" href="https://suyuhuan.gitee.io/yuwanzi.io/2017/08/13/2017-08-13-BTrees/">

    <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
      <meta itemprop="image" content="/yuwanzi.io/images/avatar.gif">
      <meta itemprop="name" content="玉丸子">
      <meta itemprop="description" content="这里是玉丸子的个人博客,与你一起发现更大的世界。">
    </span>

    <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
      <meta itemprop="name" content="玉丸子 | Blog">
    </span>
      <header class="post-header">
        <h2 class="post-title" itemprop="name headline">
          <a href="/yuwanzi.io/2017/08/13/2017-08-13-BTrees/" class="post-title-link" itemprop="url">B树的那点事儿</a>
        </h2>

        <div class="post-meta-container">
          <div class="post-meta">
    <span class="post-meta-item">
      <span class="post-meta-item-icon">
        <i class="far fa-calendar"></i>
      </span>
      <span class="post-meta-item-text">Veröffentlicht am</span>

      <time title="Erstellt: 2017-08-13 12:00:00" itemprop="dateCreated datePublished" datetime="2017-08-13T12:00:00+08:00">2017-08-13</time>
    </span>
      <span class="post-meta-item">
        <span class="post-meta-item-icon">
          <i class="far fa-calendar-check"></i>
        </span>
        <span class="post-meta-item-text">Bearbeitet am</span>
        <time title="Geändert am: 2020-11-07 08:58:17" itemprop="dateModified" datetime="2020-11-07T08:58:17+08:00">2020-11-07</time>
      </span>
    <span class="post-meta-item">
      <span class="post-meta-item-icon">
        <i class="far fa-folder"></i>
      </span>
      <span class="post-meta-item-text">in</span>
        <span itemprop="about" itemscope itemtype="http://schema.org/Thing">
          <a href="/yuwanzi.io/categories/Algorithms/" itemprop="url" rel="index"><span itemprop="name">Algorithms</span></a>
        </span>
          . 
        <span itemprop="about" itemscope itemtype="http://schema.org/Thing">
          <a href="/yuwanzi.io/categories/Algorithms/%E6%95%B0%E6%8D%AE%E7%BB%93%E6%9E%84/" itemprop="url" rel="index"><span itemprop="name">数据结构</span></a>
        </span>
          . 
        <span itemprop="about" itemscope itemtype="http://schema.org/Thing">
          <a href="/yuwanzi.io/categories/Algorithms/%E6%95%B0%E6%8D%AE%E7%BB%93%E6%9E%84/Tree/" itemprop="url" rel="index"><span itemprop="name">Tree</span></a>
        </span>
    </span>

  
</div>

        </div>
      </header>

    
    
    
    <div class="post-body" itemprop="articleBody">
          <h3 id="概述"><a href="#概述" class="headerlink" title="概述"></a>概述</h3><hr>
<p>B树(<code>B-Tree</code>)是一种自平衡的树,能够保证数据有序.同时它还保证了在查找、插入、删除等操作时性能都能保持在$O(\log n)$.需要注意的一点是,<strong><code>B-Tree</code>并不是一棵自平衡的二叉查找树,它拥有多个分叉,且为大块数据的读写操作做了优化,同时它也可以用来描述外部存储(支持对保存在磁盘或者网络上的符号表进行外部查找).</strong></p>
<p>在当今的互联网环境下,数据量已经大到无法想象,而能够在巨型数据集合中快速地进行查找操作是非常重要的,而<code>B-Tree</code>的神奇之处正在于: 只需要使用4~5个指向一小块数据的引用即可有效支持在数百亿甚至更多元素的符号表中进行查找和插入等操作.</p>
<p><code>B-Tree</code>的主要应用在于文件系统与数据库系统,例如<code>MySQL</code>中的<code>InnoDB</code>存储引擎就使用到了<code>B-Tree</code>来实现索引.</p>
<blockquote>
<p>本文作者为: <a target="_blank" rel="noopener" href="https://github.com/SylvanasSun">SylvanasSun</a>.转载请务必将下面这段话置于文章开头处(保留超链接).<br>本文转发自<a target="_blank" rel="noopener" href="https://sylvanassun.github.io/">SylvanasSun Blog</a>,原文链接: <a target="_blank" rel="noopener" href="https://sylvanassun.github.io/2017/08/13/2017-08-13-BTrees/">https://sylvanassun.github.io/2017/08/13/2017-08-13-BTrees/</a></p>
</blockquote>
<h3 id="数据表示"><a href="#数据表示" class="headerlink" title="数据表示"></a>数据表示</h3><hr>
<p>我们使用页来表示一块连续的数据,访问一页的数据需要将它读入本地内存.一个页可能是本地计算机上的一个文件,也可能是服务器上的某个文件的一部分等等.页的访问次数(无论读写)即是外部查找算法的成本模型.</p>
<p>首先,构造一棵<code>B-Tree</code><strong>不会将数据保存在树中</strong>,而是会构造一棵<strong>由键的副本组成的树,每个副本都关联着一条链接</strong>.这种方法能够将索引与符号表进行分离,同时我们还需要遵循以下的规定: </p>
<ul>
<li>选择一个参数<code>M</code>来构造一棵多向树(<code>M</code>一般为偶数),每个节点最多含有<code>M - 1</code>对键和链接.</li>
</ul>
<ul>
<li>每个节点最少含有<code>M / 2</code>对键和链接,根节点例外(它最少可以含有2对).</li>
</ul>
<ul>
<li>使用<code>M</code>阶的<code>B-Tree</code>来指定<code>M</code>的值,例如: 在一棵4阶<code>B-Tree</code>中,每个节点都含有至少2对至多3对.</li>
</ul>
<ul>
<li><code>B-Tree</code>含有两种不同类型的节点,内部节点与外部节点.</li>
</ul>
<ul>
<li>内部节点含有与页相关联的键的副本: 每个键都与一个节点相关联(一条链接),以此节点为根的子树中,所有的键都大于等于与此节点关联的键,但小于原内部节点中更大的键(如果存在的话).</li>
</ul>
<ul>
<li>外部节点含有指向实际数据的引用: 每个键都对应着实际的值,外部节点就是一张普通的符号表.</li>
</ul>
<p><img src="http://wx3.sinaimg.cn/large/63503acbly1fihx6539d3j21kw0m9adu.jpg" alt="B-Tree结构示意图"></p>
<figure class="highlight java"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br></pre></td><td class="code"><pre><span class="line"><span class="comment">// max children per B-tree node = M - 1</span></span><br><span class="line"><span class="comment">// must be even and greater than 2</span></span><br><span class="line"><span class="keyword">private</span> <span class="keyword">static</span> <span class="keyword">final</span> <span class="keyword">int</span> M = <span class="number">4</span>;</span><br><span class="line"></span><br><span class="line"><span class="comment">// root of the B-tree</span></span><br><span class="line"><span class="keyword">private</span> Node root;</span><br><span class="line"></span><br><span class="line"><span class="comment">// height of the B-tree</span></span><br><span class="line"><span 
class="keyword">private</span> <span class="keyword">int</span> height;</span><br><span class="line"></span><br><span class="line"><span class="comment">// number of key-value paris int the B-tree</span></span><br><span class="line"><span class="keyword">private</span> <span class="keyword">int</span> N;</span><br><span class="line"></span><br><span class="line"><span class="comment">// B-tree node data type</span></span><br><span class="line"><span class="keyword">private</span> <span class="keyword">static</span> <span class="keyword">final</span> <span class="class"><span class="keyword">class</span> <span class="title">Node</span> </span>&#123;</span><br><span class="line">	<span class="keyword">private</span> <span class="keyword">int</span> children_length;</span><br><span class="line">	<span class="keyword">private</span> Entry[] children = <span class="keyword">new</span> Entry[M];</span><br><span class="line"></span><br><span class="line">	<span class="comment">// create a node with k children</span></span><br><span class="line">	<span class="function"><span class="keyword">private</span> <span class="title">Node</span><span class="params">(<span class="keyword">int</span> k)</span> </span>&#123;</span><br><span class="line">		children_length = k;</span><br><span class="line">	&#125;</span><br><span class="line">&#125;</span><br><span class="line"></span><br><span class="line"><span class="comment">// internal nodes : only use key and next</span></span><br><span class="line"><span class="comment">// external nodes : only use key and value</span></span><br><span class="line"><span class="keyword">private</span> <span class="keyword">static</span> <span class="class"><span class="keyword">class</span> <span class="title">Entry</span> </span>&#123;</span><br><span class="line">	<span class="keyword">private</span> Comparable key;</span><br><span class="line">	<span class="keyword">private</span> <span class="keyword">final</span> Object value;</span><br><span 
class="line">	<span class="keyword">private</span> Node next;</span><br><span class="line"></span><br><span class="line">	<span class="function"><span class="keyword">private</span> <span class="title">Entry</span><span class="params">(Comparable key, Object value, Node next)</span> </span>&#123;</span><br><span class="line">		<span class="keyword">this</span>.key = key;</span><br><span class="line">		<span class="keyword">this</span>.value = value;</span><br><span class="line">		<span class="keyword">this</span>.next = next;</span><br><span class="line">	&#125;</span><br><span class="line">&#125;</span><br></pre></td></tr></table></figure>

<h3 id="查找"><a href="#查找" class="headerlink" title="查找"></a>查找</h3><hr>
<p>在<code>B-Tree</code>中进行查找操作每次都会结束于一个外部节点.在查找时,<strong>从根节点开始,根据被查找的键来选择当前节点中的适当区间并根据对应的链接从一个节点移动到下一层节点</strong>.最终,查找过程会到达树底的一个含有键的页(也就是外部节点),如果被查找的键在该页中,查找命中并结束,如果不在,则查找未命中.</p>
<figure class="highlight java"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br></pre></td><td class="code"><pre><span class="line"><span class="function"><span class="keyword">public</span> Value <span class="title">get</span><span class="params">(Key key)</span> </span>&#123;</span><br><span class="line">	validateKey(key, <span class="string">&quot;argument key to get() is null.&quot;</span>);</span><br><span class="line">	<span class="keyword">return</span> search(root, key, height);</span><br><span class="line">&#125;</span><br><span class="line"></span><br><span class="line"><span class="function"><span class="keyword">private</span> Value <span class="title">search</span><span class="params">(Node x, Key key, <span class="keyword">int</span> height)</span> </span>&#123;</span><br><span class="line">	<span class="keyword">while</span> (x != <span class="keyword">null</span>) &#123;</span><br><span class="line">		Entry[] children = x.children;</span><br><span class="line">		<span class="keyword">int</span> 
children_length = x.children_length;</span><br><span class="line"></span><br><span class="line">		<span class="comment">// 当树的高度已经递减为0时,也就到达了树的底部(一个外部节点)</span></span><br><span class="line">		<span class="comment">// 遍历当前节点的每个键进行比较,如果找到则查找命中返回对应的值.</span></span><br><span class="line">		<span class="keyword">if</span> (height == <span class="number">0</span>) &#123;</span><br><span class="line">			<span class="keyword">for</span> (<span class="keyword">int</span> j = <span class="number">0</span>; j &lt; children_length; j++) &#123;</span><br><span class="line">				<span class="keyword">if</span> (eq(key, children[j].key))</span><br><span class="line">					<span class="keyword">return</span> (Value) children[j].value;</span><br><span class="line">			&#125;</span><br><span class="line">		&#125; <span class="keyword">else</span> &#123;</span><br><span class="line">			<span class="comment">// 当还是内部节点时,根据键来查找适当的区间</span></span><br><span class="line">			<span class="keyword">for</span> (<span class="keyword">int</span> j = <span class="number">0</span>; j &lt; children_length; j++) &#123;</span><br><span class="line">				<span class="keyword">if</span> (j + <span class="number">1</span> == children_length || less(key, children[j + <span class="number">1</span>].key)) &#123;</span><br><span class="line">					<span class="comment">// 找到适当的区间后,移动到下一层节点</span></span><br><span class="line">					x = children[j].next;</span><br><span class="line">					height--;</span><br><span class="line">					<span class="keyword">break</span>;</span><br><span class="line">				&#125;</span><br><span class="line">			&#125;</span><br><span class="line">		&#125;</span><br><span class="line">	&#125;</span><br><span class="line">	<span class="keyword">return</span> <span class="keyword">null</span>;</span><br><span class="line">&#125;</span><br></pre></td></tr></table></figure>

<h3 id="插入"><a href="#插入" class="headerlink" title="插入"></a>插入</h3><hr>
<p>插入操作也要先从根节点不断递归地查找到合适的区间,但需要注意一点,如果查找到的外部节点已经满了怎么办呢?</p>
<p>解决方法也很简单,我们允许被插入的节点暂时“溢出”,然后在递归调用返回时自底向上不断地进行分裂.例如:当<code>M</code>为6时,根节点溢出为<code>6-节点</code>,只需要将它分裂为连接了两个<code>3-节点</code>的<code>2-节点</code>.即将一个溢出的<code>M-</code>节点分裂为两个<code>(M / 2)-</code>节点,并使它的父节点由<code>k-</code>节点变为<code>(k + 1)-</code>节点.</p>
<figure class="highlight java"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br><span class="line">43</span><br><span class="line">44</span><br><span class="line">45</span><br><span class="line">46</span><br><span class="line">47</span><br><span class="line">48</span><br><span class="line">49</span><br><span class="line">50</span><br><span class="line">51</span><br><span class="line">52</span><br><span class="line">53</span><br><span class="line">54</span><br><span class="line">55</span><br><span class="line">56</span><br><span class="line">57</span><br><span class="line">58</span><br><span class="line">59</span><br><span class="line">60</span><br><span 
class="line">61</span><br><span class="line">62</span><br><span class="line">63</span><br><span class="line">64</span><br><span class="line">65</span><br><span class="line">66</span><br></pre></td><td class="code"><pre><span class="line"><span class="function"><span class="keyword">public</span> <span class="keyword">void</span> <span class="title">put</span><span class="params">(Key key, Value value)</span> </span>&#123;</span><br><span class="line">	validateKey(key, <span class="string">&quot;argument key to put() is null.&quot;</span>);</span><br><span class="line"></span><br><span class="line">	Node u = insert(root, key, value, height);</span><br><span class="line">	N++;</span><br><span class="line">	<span class="keyword">if</span> (u == <span class="keyword">null</span>)</span><br><span class="line">		<span class="keyword">return</span>;</span><br><span class="line"></span><br><span class="line">	<span class="comment">// need to split root</span></span><br><span class="line">	Node t = <span class="keyword">new</span> Node(<span class="number">2</span>);</span><br><span class="line">	t.children[<span class="number">0</span>] = <span class="keyword">new</span> Entry(root.children[<span class="number">0</span>].key, <span class="keyword">null</span>, root);</span><br><span class="line">	t.children[<span class="number">1</span>] = <span class="keyword">new</span> Entry(u.children[<span class="number">0</span>].key, <span class="keyword">null</span>, u);</span><br><span class="line">	root = t;</span><br><span class="line">	height++;</span><br><span class="line">&#125;</span><br><span class="line"></span><br><span class="line"><span class="function"><span class="keyword">private</span> Node <span class="title">insert</span><span class="params">(Node x, Key key, Value value, <span class="keyword">int</span> height)</span> </span>&#123;</span><br><span class="line">	<span class="keyword">int</span> j;</span><br><span class="line">	Entry t = <span 
class="keyword">new</span> Entry(key, value, <span class="keyword">null</span>);</span><br><span class="line">	Entry[] children = x.children;</span><br><span class="line">	<span class="keyword">int</span> children_length = x.children_length;</span><br><span class="line"></span><br><span class="line">	<span class="comment">// external node</span></span><br><span class="line">	<span class="keyword">if</span> (height == <span class="number">0</span>) &#123;</span><br><span class="line">		<span class="keyword">for</span> (j = <span class="number">0</span>; j &lt; children_length; j++) &#123;</span><br><span class="line">			<span class="keyword">if</span> (less(key, children[j].key))</span><br><span class="line">				<span class="keyword">break</span>;</span><br><span class="line">		&#125;</span><br><span class="line">	&#125; <span class="keyword">else</span> &#123;</span><br><span class="line">		<span class="comment">// internal node</span></span><br><span class="line">		<span class="keyword">for</span> (j = <span class="number">0</span>; j &lt; children_length; j++) &#123;</span><br><span class="line">			<span class="keyword">if</span> (j + <span class="number">1</span> == children_length || less(key, children[j + <span class="number">1</span>].key)) &#123;</span><br><span class="line">				<span class="comment">// 找到合适的区间后继续递归调用</span></span><br><span class="line">				Node u = insert(children[j++].next, key, value, height - <span class="number">1</span>);</span><br><span class="line">				<span class="comment">// 如果下一层没有进行过分裂操作,直接返回null</span></span><br><span class="line">				<span class="keyword">if</span> (u == <span class="keyword">null</span>)</span><br><span class="line">					<span class="keyword">return</span> <span class="keyword">null</span>;	</span><br><span class="line">				t.key = u.children[<span class="number">0</span>].key;</span><br><span class="line">				t.next = u;</span><br><span class="line">				<span class="keyword">break</span>;</span><br><span 
class="line">			&#125;</span><br><span class="line">		&#125;</span><br><span class="line">	&#125;</span><br><span class="line"></span><br><span class="line">	<span class="comment">// 将j之后的元素全部右移(为了腾出j的插入位置)</span></span><br><span class="line">	<span class="keyword">for</span> (<span class="keyword">int</span> i = children_length; i &gt; j; i--) &#123;</span><br><span class="line">		children[i] = children[i - <span class="number">1</span>];</span><br><span class="line">	&#125;</span><br><span class="line">	</span><br><span class="line">	children[j] = t;</span><br><span class="line">	x.children_length++;</span><br><span class="line">	<span class="keyword">if</span> (x.children_length &lt; M)</span><br><span class="line">		<span class="keyword">return</span> <span class="keyword">null</span>;</span><br><span class="line">	<span class="keyword">else</span></span><br><span class="line">		<span class="keyword">return</span> split(x); <span class="comment">// 如果空间已满,进行分裂</span></span><br><span class="line">&#125;	</span><br><span class="line"></span><br><span class="line"> <span class="comment">// 将x分裂为两个含有new_length对键的节点</span></span><br><span class="line"><span class="function"><span class="keyword">private</span> Node <span class="title">split</span><span class="params">(Node x)</span> </span>&#123;</span><br><span class="line">	<span class="keyword">int</span> new_length = M / <span class="number">2</span>;</span><br><span class="line">	Node t = <span class="keyword">new</span> Node(new_length);</span><br><span class="line">	x.children_length = new_length;</span><br><span class="line">	<span class="keyword">for</span> (<span class="keyword">int</span> j = <span class="number">0</span>; j &lt; new_length; j++)</span><br><span class="line">		t.children[j] = x.children[new_length + j];</span><br><span class="line">	<span class="keyword">return</span> t;</span><br><span class="line">&#125;	</span><br></pre></td></tr></table></figure>

<h3 id="参考文献"><a href="#参考文献" class="headerlink" title="参考文献"></a>参考文献</h3><hr>
<ul>
<li><a target="_blank" rel="noopener" href="http://algs4.cs.princeton.edu/home/">Algorithms, 4th Edition by Robert Sedgewick and Kevin Wayne</a></li>
</ul>
<ul>
<li><a target="_blank" rel="noopener" href="https://en.wikipedia.org/wiki/B-tree">B-tree - Wikipedia</a></li>
</ul>

      
    </div>

    
    
    

    <footer class="post-footer">
        <div class="post-eof"></div>
      
    </footer>
  </article>
</div>




    


<div class="post-block">
  
  

  <article itemscope itemtype="http://schema.org/Article" class="post-content" lang="zh">
    <link itemprop="mainEntityOfPage" href="https://suyuhuan.gitee.io/yuwanzi.io/2017/08/06/2017-08-06-DigestHttps/">

    <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
      <meta itemprop="image" content="/yuwanzi.io/images/avatar.gif">
      <meta itemprop="name" content="玉丸子">
      <meta itemprop="description" content="这里是玉丸子的个人博客,与你一起发现更大的世界。">
    </span>

    <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
      <meta itemprop="name" content="玉丸子 | Blog">
    </span>
      <header class="post-header">
        <h2 class="post-title" itemprop="name headline">
          <a href="/yuwanzi.io/2017/08/06/2017-08-06-DigestHttps/" class="post-title-link" itemprop="url">揭秘HTTPS的"秘密"</a>
        </h2>

        <div class="post-meta-container">
          <div class="post-meta">
    <span class="post-meta-item">
      <span class="post-meta-item-icon">
        <i class="far fa-calendar"></i>
      </span>
      <span class="post-meta-item-text">Veröffentlicht am</span>

      <time title="Erstellt: 2017-08-06 18:00:00" itemprop="dateCreated datePublished" datetime="2017-08-06T18:00:00+08:00">2017-08-06</time>
    </span>
      <span class="post-meta-item">
        <span class="post-meta-item-icon">
          <i class="far fa-calendar-check"></i>
        </span>
        <span class="post-meta-item-text">Bearbeitet am</span>
        <time title="Geändert am: 2020-11-07 08:58:17" itemprop="dateModified" datetime="2020-11-07T08:58:17+08:00">2020-11-07</time>
      </span>
    <span class="post-meta-item">
      <span class="post-meta-item-icon">
        <i class="far fa-folder"></i>
      </span>
      <span class="post-meta-item-text">in</span>
        <span itemprop="about" itemscope itemtype="http://schema.org/Thing">
          <a href="/yuwanzi.io/categories/%E7%BD%91%E7%BB%9C/" itemprop="url" rel="index"><span itemprop="name">网络</span></a>
        </span>
          . 
        <span itemprop="about" itemscope itemtype="http://schema.org/Thing">
          <a href="/yuwanzi.io/categories/%E7%BD%91%E7%BB%9C/http/" itemprop="url" rel="index"><span itemprop="name">http</span></a>
        </span>
    </span>

  
</div>

        </div>
      </header>

    
    
    
    <div class="post-body" itemprop="articleBody">
          <p>在说<code>https</code>之前,我们先了解一下<code>http</code>,以及为什么要使用<code>https</code>.</p>
<p><code>http(Hyper Text Transfer  Protocol)</code>超文本传输协议是一种用于分布式、协作式和超媒体信息系统的应用层协议,它是<code>TCP/IP</code>的上层协议,同时它也是万维网(万维网不等同于互联网,它只是基于互联网的一个服务)的数据通信的基础.</p>
<p><code>http</code>协议是客户端浏览器与其他程序或<code>Web</code>服务器之间交互的应用层通讯协议.但它也有一个致命的缺点:<strong><code>http</code>协议是明文传输协议</strong>,在传输信息的过程中并没有进行任何加密,通信的双方也没有任何的认证,这是非常不安全的,如果在通信过程中被中间人进行劫持、监听、篡改,会造成个人隐私泄露等严重的安全问题.</p>
<p>举一个现实中的例子来说,假设小李要给小张寄信,如果信件在运输的过程中没有任何安全保护,那么很可能会被邮递员(也就是中间人)窃取其中的内容,甚至于修改内容.</p>
<p><code>https</code>就是用于解决这样的安全问题的,它的全称为<code>Hypertext Transfer Protocol Secure</code>,它在<code>http</code>的基础上添加了<code>SSL(安全套接字层)</code>来保证传输数据的安全.</p>
<p><img src="http://wx2.sinaimg.cn/mw690/63503acbly1fia1cscjslj20lb0a7443.jpg"></p>
<blockquote>
<p>本文作者为: <a target="_blank" rel="noopener" href="https://github.com/SylvanasSun">SylvanasSun</a>.转载请务必将下面这段话置于文章开头处(保留超链接).<br>本文转发自<a target="_blank" rel="noopener" href="https://sylvanassun.github.io/">SylvanasSun Blog</a>,原文链接: <a target="_blank" rel="noopener" href="https://sylvanassun.github.io/2017/08/06/2017-08-06-DigestHttps/">https://sylvanassun.github.io/2017/08/06/2017-08-06-DigestHttps/</a></p>
</blockquote>
<p><code>https</code>提供了端对端的加密,而且不仅对数据进行了加密,还对数据完整性提供了保护.不过在讲解<code>https</code>的加密方式之前,我们需要先了解一下加密算法.</p>
<h3 id="对称加密"><a href="#对称加密" class="headerlink" title="对称加密"></a>对称加密</h3><hr>
<p>对称加密的基本思想是: 通信双方使用同一个密钥(或者是两个可以简单地互相推算的密钥)来对明文进行加密与解密.</p>
<p>常见的对称加密算法有DES、3DES、AES、Blowfish、IDEA、RC5、RC6.</p>
<p><img src="http://wx3.sinaimg.cn/mw690/63503acbly1fia3vwjvcdj20j208ydg4.jpg"></p>
<p>对称加密看起来很美好,但是密钥要怎么发送过去呢?如果直接发送过去,被中间人截获了密钥岂不是白费工夫.</p>
<h3 id="非对称加密"><a href="#非对称加密" class="headerlink" title="非对称加密"></a>非对称加密</h3><hr>
<p>非对称加密也叫公开密钥加密,它使用了两个密钥,一个为公钥,一个为私钥,当一个用作于加密的时候,另一个则用作解密.</p>
<p>这两个密钥就算被其他人知道了其中一个也不能凭借它计算出另一个密钥,所以可以公开其中一个密钥(也就是公钥),不公开的密钥为私钥.</p>
<p><img src="http://wx2.sinaimg.cn/mw690/63503acbly1fia4nut76kj20l50cht97.jpg"></p>
<p>如果服务器想发送消息给客户端,只需要用客户端的公钥加密,然后客户端用它自己的私钥进行解密.</p>
<p>常见的非对称加密算法有RSA、DSA、ECDSA、DH、ECDHE.</p>
<p>我们以<code>DH</code>算法为例,了解一下非对称加密的魅力.</p>
<p><img src="https://upload.wikimedia.org/wikipedia/commons/thumb/1/13/Diffie-Hellman-Schl%C3%BCsselaustausch.svg/800px-Diffie-Hellman-Schl%C3%BCsselaustausch.svg.png"></p>
<ol>
<li><p><code>Alice</code>要与<code>Bob</code>进行通信,他们协定了一组可以公开的质数$p=23$,$g=5$.</p>
</li>
<li><p><code>Alice</code>选择了一个不公开的秘密数$a=6$,并计算$A = g^a \bmod p = 5^6 \bmod 23 = 8$并发送给<code>Bob</code>.</p>
</li>
<li><p><code>Bob</code>选择了一个不公开的秘密数$b=15$,并计算$B = g^b \bmod p = 5^{15} \bmod 23 = 19$并发送给<code>Alice</code>.</p>
</li>
<li><p><code>Alice</code>计算$S = B^a \bmod p = 19^6 \bmod 23 = 2$.</p>
</li>
<li><p><code>Bob</code>计算$S = A^b \bmod p = 8^{15} \bmod 23 = 2$.</p>
</li>
<li><p><code>Alice</code>与<code>Bob</code>得到了同样的值,因为$g^{ab} \bmod p = g^{ba} \bmod p$.</p>
</li>
</ol>
<h3 id="对称加密-非对称加密"><a href="#对称加密-非对称加密" class="headerlink" title="对称加密+非对称加密"></a>对称加密+非对称加密</h3><hr>
<p>尽管非对称加密如此奇妙,但它加解密的效率比对称加密要低多了.那我们就将对称加密与非对称加密结合起来,取其精华,去其糟粕.</p>
<p>方法很简单,其中一方先自己生成一个对称加密密钥,然后通过非对称加密的方式来发送这个密钥,这样双方之后的通信就可以用对称加密这种高效率的算法进行加解密了.</p>
<h3 id="Certificate-Authority"><a href="#Certificate-Authority" class="headerlink" title="Certificate Authority"></a>Certificate Authority</h3><hr>
<p>对称加密与非对称加密结合使用的方法虽然能够保证通信过程的安全,但也引发了如下问题: </p>
<ul>
<li>客户端要如何获取到服务器的公钥?</li>
</ul>
<ul>
<li>如果公钥在发送过程被中间人拦截,然后中间人发送自己的公钥给客户端,客户端该如何确认?</li>
</ul>
<p>解决方法是通过一个权威的<code>CA(Certificate Authority)</code>证书中心,由它来负责颁发证书,证书包含了如下内容: </p>
<ul>
<li>证书的发布机构.</li>
</ul>
<ul>
<li>证书的有效期</li>
</ul>
<ul>
<li>公钥</li>
</ul>
<ul>
<li>证书所有人</li>
</ul>
<ul>
<li>数字签名</li>
</ul>
<p>数字签名是用来验证数据完整性的,首先将公钥与个人信息用一个<code>Hash</code>算法生成一个消息摘要,<code>Hash</code>算法是不可逆的,且只要内容发生变化,那生成的消息摘要将会截然不同.然后<code>CA</code>再用它的私钥对消息摘要加密,最终形成数字签名.</p>
<p>当客户端接收到证书时,只需要用同样的<code>Hash</code>算法再次生成一个消息摘要,然后用<code>CA</code>的公钥对证书中的数字签名进行解密,之后再对比两个消息摘要就能知道数据有没有被篡改过了.</p>
<p>那么<code>CA</code>的公钥又要从哪里来呢?这似乎陷入了一个鸡生蛋,蛋生鸡的悖论,其实<code>CA</code>也有证书来证明自己,而且<code>CA</code>证书的信用体系就像一棵树的结构,上层节点是信用高的<code>CA</code>,同时它也会对底层的<code>CA</code>做信用背书,操作系统中已经内置了一些根证书,所以相当于你已经自动信任了它们(需要注意不要误安装一些非法或不安全的证书).</p>
<h3 id="Https的交互过程"><a href="#Https的交互过程" class="headerlink" title="Https的交互过程"></a>Https的交互过程</h3><hr>
<p><img src="http://wx2.sinaimg.cn/mw690/63503acbly1fia7yy31rdj20in0o7wfy.jpg"></p>
<ul>
<li>浏览器对服务器发送了一次请求.</li>
</ul>
<ul>
<li>服务器发送证书.</li>
</ul>
<ul>
<li>浏览器读取证书中的所有人,有效期等信息并进行校验.</li>
</ul>
<ul>
<li>浏览器查找操作系统中内置的已经信任的根证书,并对服务器发来的证书进行验证.</li>
</ul>
<ul>
<li>如果找不到,浏览器报错,服务器发来的证书是不可信任的.</li>
</ul>
<ul>
<li>如果找到,浏览器会从操作系统中取出<code>CA</code>的公钥,然后对服务器发来的证书中的数字签名进行解密.</li>
</ul>
<ul>
<li>浏览器使用相同的<code>Hash</code>算法计算出消息摘要,然后对数字签名中的消息摘要进行校对.</li>
</ul>
<ul>
<li>如果结果一致,证书合法.</li>
</ul>
<ul>
<li>之后浏览器就可以生成对称加密的密钥然后用非对称加密的方式发送给服务器,之后的通信就都是安全的了.</li>
</ul>
<h3 id="总结"><a href="#总结" class="headerlink" title="总结"></a>总结</h3><hr>
<p>现在国内外的大型网站基本都已经全站启用了<code>Https</code>,虽然相对于<code>Http</code>多了许多用于加密的流程,但为了数据的安全这点牺牲是必要的,<code>Https</code>也将是未来互联网的发展趋势.</p>
<h3 id="参考文献"><a href="#参考文献" class="headerlink" title="参考文献"></a>参考文献</h3><hr>
<ul>
<li><a target="_blank" rel="noopener" href="https://en.wikipedia.org/wiki/HTTPS">HTTPS - Wikipedia</a></li>
</ul>
<ul>
<li><a target="_blank" rel="noopener" href="https://en.wikipedia.org/wiki/Public-key_cryptography">Public-key cryptography - Wikipedia</a></li>
</ul>
<ul>
<li><a target="_blank" rel="noopener" href="https://en.wikipedia.org/wiki/Diffie%E2%80%93Hellman_key_exchange">Diffie–Hellman key exchange - Wikipedia</a></li>
</ul>
<ul>
<li><a target="_blank" rel="noopener" href="https://mp.weixin.qq.com/s?__biz=MzAxOTc0NzExNg==&mid=2665513779&idx=1&sn=a1de58690ad4f95111e013254a026ca2&chksm=80d67b70b7a1f26697fa1626b3e9830dbdf4857d7a9528d22662f2e43af149265c4fd1b60024#rd">一个故事讲完https</a></li>
</ul>

      
    </div>

    
    
    

    <footer class="post-footer">
        <div class="post-eof"></div>
      
    </footer>
  </article>
</div>




  <nav class="pagination">
    <a class="extend prev" rel="prev" href="/yuwanzi.io/"><i class="fa fa-angle-left" aria-label="Vorherige Seite"></i></a><a class="page-number" href="/yuwanzi.io/">1</a><span class="page-number current">2</span><a class="page-number" href="/yuwanzi.io/page/3/">3</a><span class="space">&hellip;</span><a class="page-number" href="/yuwanzi.io/page/7/">7</a><a class="extend next" rel="next" href="/yuwanzi.io/page/3/"><i class="fa fa-angle-right" aria-label="Nächste Seite"></i></a>
  </nav>


<script>
  // Restores and persists the visitor's preferred comment tab.
  //
  // On `tabs:register`, the tab whose link targets `#comment-<class>` is
  // clicked. `<class>` comes from localStorage (when CONFIG.comments.storage
  // is truthy) and falls back to CONFIG.comments.activeClass.
  // NOTE(review): `tabs:register` / `tabs:click` are presumably dispatched by
  // the theme's tab script — confirm against the code that fires them.
  window.addEventListener('tabs:register', () => {
    let { activeClass } = CONFIG.comments;
    if (CONFIG.comments.storage) {
      try {
        activeClass = localStorage.getItem('comments_active') || activeClass;
      } catch (error) {
        // Storage access can throw (e.g. SecurityError when the browser blocks
        // site data). Keep the configured default instead of aborting, so the
        // default tab is still activated below.
      }
    }
    if (activeClass) {
      const activeTab = document.querySelector(`a[href="#comment-${activeClass}"]`);
      if (activeTab) {
        activeTab.click();
      }
    }
  });
  if (CONFIG.comments.storage) {
    // Remember which comment pane was opened so it can be restored next time.
    window.addEventListener('tabs:click', event => {
      if (!event.target.matches('.tabs-comment .tab-content .tab-pane')) return;
      // The second class of the pane is used as the comment-system identifier.
      // NOTE(review): assumes pane markup is `class="tab-pane <name>"` — confirm.
      const commentClass = event.target.classList[1];
      // Without a second class there is nothing meaningful to store; writing
      // it anyway would persist the literal string "undefined".
      if (!commentClass) return;
      try {
        localStorage.setItem('comments_active', commentClass);
      } catch (error) {
        // Persisting the choice is best-effort; ignore storage failures
        // (blocked storage, exceeded quota).
      }
    });
  }
</script>
</div>
  </main>

  <footer class="footer">
    <div class="footer-inner">


<div class="copyright">
  &copy; 
  <span itemprop="copyrightYear">2021</span>
  <span class="with-love">
    <i class="fa fa-heart"></i>
  </span>
  <span class="author" itemprop="copyrightHolder">玉丸子</span>
</div>
  <div class="powered-by">Erstellt mit  <a href="https://hexo.io/" class="theme-link" rel="noopener" target="_blank">Hexo</a> & <a href="https://theme-next.js.org/muse/" class="theme-link" rel="noopener" target="_blank">NexT.Muse</a>
  </div>

    </div>
  </footer>

  
  <script src="//cdn.jsdelivr.net/npm/animejs@3.2.1/lib/anime.min.js"></script>
<script src="/yuwanzi.io/js/utils.js"></script><script src="/yuwanzi.io/js/motion.js"></script><script src="/yuwanzi.io/js/schemes/muse.js"></script><script src="/yuwanzi.io/js/next-boot.js"></script>

  






  





</body>
</html>
